diff options
Diffstat (limited to 'intern/cycles')
259 files changed, 16290 insertions, 5429 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index 806a8660e8c..c53a9f91cc0 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -191,7 +191,7 @@ endif() # Logging capabilities using GLog library. if(WITH_CYCLES_LOGGING) add_definitions(-DWITH_CYCLES_LOGGING) - add_definitions(-DGOOGLE_GLOG_DLL_DECL=) + add_definitions(${GLOG_DEFINES}) add_definitions(-DCYCLES_GFLAGS_NAMESPACE=${GFLAGS_NAMESPACE}) include_directories( SYSTEM diff --git a/intern/cycles/app/CMakeLists.txt b/intern/cycles/app/CMakeLists.txt index aabb8f63640..08a3931ef46 100644 --- a/intern/cycles/app/CMakeLists.txt +++ b/intern/cycles/app/CMakeLists.txt @@ -35,18 +35,15 @@ if(WITH_CYCLES_OSL) list(APPEND LIBRARIES cycles_kernel_osl) endif() -if(CYCLES_STANDALONE_REPOSITORY) - if(WITH_CYCLES_LOGGING) - list(APPEND LIBRARIES - ${GLOG_LIBRARIES} - ${GFLAGS_LIBRARIES} - ) - endif() -else() +if(NOT CYCLES_STANDALONE_REPOSITORY) list(APPEND LIBRARIES bf_intern_glew_mx bf_intern_guardedalloc) - if(WITH_CYCLES_LOGGING) - list(APPEND LIBRARIES extern_glog extern_gflags) - endif() +endif() + +if(WITH_CYCLES_LOGGING) + list(APPEND LIBRARIES + ${GLOG_LIBRARIES} + ${GFLAGS_LIBRARIES} + ) endif() if(WITH_CYCLES_STANDALONE AND WITH_CYCLES_STANDALONE_GUI) diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py index eb792af7264..a2d6262fb20 100644 --- a/intern/cycles/blender/addon/__init__.py +++ b/intern/cycles/blender/addon/__init__.py @@ -102,6 +102,9 @@ class CyclesRender(bpy.types.RenderEngine): else: self.report({'ERROR'}, "OSL support disabled in this build.") + def update_render_passes(self, scene, srl): + engine.register_passes(self, scene, srl) + def engine_exit(): engine.exit() diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index ab57dd44bdb..3018fd5b316 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -205,3 +205,49 @@ 
def with_network(): def system_info(): import _cycles return _cycles.system_info() + +def register_passes(engine, scene, srl): + engine.register_pass(scene, srl, "Combined", 4, "RGBA", 'COLOR') + + if srl.use_pass_z: engine.register_pass(scene, srl, "Depth", 1, "Z", 'VALUE') + if srl.use_pass_mist: engine.register_pass(scene, srl, "Mist", 1, "Z", 'VALUE') + if srl.use_pass_normal: engine.register_pass(scene, srl, "Normal", 3, "XYZ", 'VECTOR') + if srl.use_pass_vector: engine.register_pass(scene, srl, "Vector", 4, "XYZW", 'VECTOR') + if srl.use_pass_uv: engine.register_pass(scene, srl, "UV", 3, "UVA", 'VECTOR') + if srl.use_pass_object_index: engine.register_pass(scene, srl, "IndexOB", 1, "X", 'VALUE') + if srl.use_pass_material_index: engine.register_pass(scene, srl, "IndexMA", 1, "X", 'VALUE') + if srl.use_pass_shadow: engine.register_pass(scene, srl, "Shadow", 3, "RGB", 'COLOR') + if srl.use_pass_ambient_occlusion: engine.register_pass(scene, srl, "AO", 3, "RGB", 'COLOR') + if srl.use_pass_diffuse_direct: engine.register_pass(scene, srl, "DiffDir", 3, "RGB", 'COLOR') + if srl.use_pass_diffuse_indirect: engine.register_pass(scene, srl, "DiffInd", 3, "RGB", 'COLOR') + if srl.use_pass_diffuse_color: engine.register_pass(scene, srl, "DiffCol", 3, "RGB", 'COLOR') + if srl.use_pass_glossy_direct: engine.register_pass(scene, srl, "GlossDir", 3, "RGB", 'COLOR') + if srl.use_pass_glossy_indirect: engine.register_pass(scene, srl, "GlossInd", 3, "RGB", 'COLOR') + if srl.use_pass_glossy_color: engine.register_pass(scene, srl, "GlossCol", 3, "RGB", 'COLOR') + if srl.use_pass_transmission_direct: engine.register_pass(scene, srl, "TransDir", 3, "RGB", 'COLOR') + if srl.use_pass_transmission_indirect: engine.register_pass(scene, srl, "TransInd", 3, "RGB", 'COLOR') + if srl.use_pass_transmission_color: engine.register_pass(scene, srl, "TransCol", 3, "RGB", 'COLOR') + if srl.use_pass_subsurface_direct: engine.register_pass(scene, srl, "SubsurfaceDir", 3, "RGB", 'COLOR') + if 
srl.use_pass_subsurface_indirect: engine.register_pass(scene, srl, "SubsurfaceInd", 3, "RGB", 'COLOR') + if srl.use_pass_subsurface_color: engine.register_pass(scene, srl, "SubsurfaceCol", 3, "RGB", 'COLOR') + if srl.use_pass_emit: engine.register_pass(scene, srl, "Emit", 3, "RGB", 'COLOR') + if srl.use_pass_environment: engine.register_pass(scene, srl, "Env", 3, "RGB", 'COLOR') + + crl = srl.cycles + if crl.pass_debug_bvh_traversed_nodes: engine.register_pass(scene, srl, "Debug BVH Traversed Nodes", 1, "X", 'VALUE') + if crl.pass_debug_bvh_traversed_instances: engine.register_pass(scene, srl, "Debug BVH Traversed Instances", 1, "X", 'VALUE') + if crl.pass_debug_bvh_intersections: engine.register_pass(scene, srl, "Debug BVH Intersections", 1, "X", 'VALUE') + if crl.pass_debug_ray_bounces: engine.register_pass(scene, srl, "Debug Ray Bounces", 1, "X", 'VALUE') + + cscene = scene.cycles + if crl.use_denoising and crl.denoising_store_passes and not cscene.use_progressive_refine: + engine.register_pass(scene, srl, "Denoising Normal", 3, "XYZ", 'VECTOR') + engine.register_pass(scene, srl, "Denoising Normal Variance", 3, "XYZ", 'VECTOR') + engine.register_pass(scene, srl, "Denoising Albedo", 3, "RGB", 'COLOR') + engine.register_pass(scene, srl, "Denoising Albedo Variance", 3, "RGB", 'COLOR') + engine.register_pass(scene, srl, "Denoising Depth", 1, "Z", 'VALUE') + engine.register_pass(scene, srl, "Denoising Depth Variance", 1, "Z", 'VALUE') + engine.register_pass(scene, srl, "Denoising Shadow A", 3, "XYV", 'VECTOR') + engine.register_pass(scene, srl, "Denoising Shadow B", 3, "XYV", 'VECTOR') + engine.register_pass(scene, srl, "Denoising Image", 3, "RGB", 'COLOR') + engine.register_pass(scene, srl, "Denoising Image Variance", 3, "RGB", 'COLOR') diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index cbf469b3a89..68474529ed3 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py 
@@ -695,10 +695,17 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): update=devices_update_callback ) - cls.debug_opencl_kernel_single_program = BoolProperty(name="Single Program", default=False, update=devices_update_callback); + cls.debug_opencl_kernel_single_program = BoolProperty( + name="Single Program", + default=True, + update=devices_update_callback, + ) cls.debug_use_opencl_debug = BoolProperty(name="Debug OpenCL", default=False) + cls.debug_opencl_mem_limit = IntProperty(name="Memory limit", default=0, + description="Artificial limit on OpenCL memory usage in MB (0 to disable limit)") + @classmethod def unregister(cls): del bpy.types.Scene.cycles @@ -1166,6 +1173,125 @@ class CyclesCurveRenderSettings(bpy.types.PropertyGroup): def unregister(cls): del bpy.types.Scene.cycles_curves +def update_render_passes(self, context): + scene = context.scene + rd = scene.render + rl = rd.layers.active + rl.update_render_passes() + +class CyclesRenderLayerSettings(bpy.types.PropertyGroup): + @classmethod + def register(cls): + bpy.types.SceneRenderLayer.cycles = PointerProperty( + name="Cycles SceneRenderLayer Settings", + description="Cycles SceneRenderLayer Settings", + type=cls, + ) + cls.pass_debug_bvh_traversed_nodes = BoolProperty( + name="Debug BVH Traversed Nodes", + description="Store Debug BVH Traversed Nodes pass", + default=False, + update=update_render_passes, + ) + cls.pass_debug_bvh_traversed_instances = BoolProperty( + name="Debug BVH Traversed Instances", + description="Store Debug BVH Traversed Instances pass", + default=False, + update=update_render_passes, + ) + cls.pass_debug_bvh_intersections = BoolProperty( + name="Debug BVH Intersections", + description="Store Debug BVH Intersections", + default=False, + update=update_render_passes, + ) + cls.pass_debug_ray_bounces = BoolProperty( + name="Debug Ray Bounces", + description="Store Debug Ray Bounces pass", + default=False, + update=update_render_passes, + ) + + cls.use_denoising = 
BoolProperty( + name="Use Denoising", + description="Denoise the rendered image", + default=False, + update=update_render_passes, + ) + cls.denoising_diffuse_direct = BoolProperty( + name="Diffuse Direct", + description="Denoise the direct diffuse lighting", + default=True, + ) + cls.denoising_diffuse_indirect = BoolProperty( + name="Diffuse Indirect", + description="Denoise the indirect diffuse lighting", + default=True, + ) + cls.denoising_glossy_direct = BoolProperty( + name="Glossy Direct", + description="Denoise the direct glossy lighting", + default=True, + ) + cls.denoising_glossy_indirect = BoolProperty( + name="Glossy Indirect", + description="Denoise the indirect glossy lighting", + default=True, + ) + cls.denoising_transmission_direct = BoolProperty( + name="Transmission Direct", + description="Denoise the direct transmission lighting", + default=True, + ) + cls.denoising_transmission_indirect = BoolProperty( + name="Transmission Indirect", + description="Denoise the indirect transmission lighting", + default=True, + ) + cls.denoising_subsurface_direct = BoolProperty( + name="Subsurface Direct", + description="Denoise the direct subsurface lighting", + default=True, + ) + cls.denoising_subsurface_indirect = BoolProperty( + name="Subsurface Indirect", + description="Denoise the indirect subsurface lighting", + default=True, + ) + cls.denoising_strength = FloatProperty( + name="Denoising Strength", + description="Controls neighbor pixel weighting for the denoising filter (lower values preserve more detail, but aren't as smooth)", + min=0.0, max=1.0, + default=0.5, + ) + cls.denoising_feature_strength = FloatProperty( + name="Denoising Feature Strength", + description="Controls removal of noisy image feature passes (lower values preserve more detail, but aren't as smooth)", + min=0.0, max=1.0, + default=0.5, + ) + cls.denoising_radius = IntProperty( + name="Denoising Radius", + description="Size of the image area that's used to denoise a pixel (higher 
values are smoother, but might lose detail and are slower)", + min=1, max=25, + default=8, + ) + cls.denoising_relative_pca = BoolProperty( + name="Relative filter", + description="When removing pixels that don't carry information, use a relative threshold instead of an absolute one (can help to reduce artifacts, but might cause detail loss around edges)", + default=False, + ) + cls.denoising_store_passes = BoolProperty( + name="Store denoising passes", + description="Store the denoising feature passes and the noisy image", + default=False, + update=update_render_passes, + ) + + @classmethod + def unregister(cls): + del bpy.types.SceneRenderLayer.cycles + class CyclesCurveSettings(bpy.types.PropertyGroup): @classmethod @@ -1297,14 +1423,14 @@ class CyclesPreferences(bpy.types.AddonPreferences): row = layout.row() if self.compute_device_type == 'CUDA' and cuda_devices: - col = row.column(align=True) + box = row.box() for device in cuda_devices: - col.prop(device, "use", text=device.name, toggle=True) + box.prop(device, "use", text=device.name) if self.compute_device_type == 'OPENCL' and opencl_devices: - col = row.column(align=True) + box = row.box() for device in opencl_devices: - col.prop(device, "use", text=device.name, toggle=True) + box.prop(device, "use", text=device.name) def draw(self, context): @@ -1324,6 +1450,7 @@ def register(): bpy.utils.register_class(CyclesCurveSettings) bpy.utils.register_class(CyclesDeviceSettings) bpy.utils.register_class(CyclesPreferences) + bpy.utils.register_class(CyclesRenderLayerSettings) def unregister(): @@ -1339,3 +1466,4 @@ def unregister(): bpy.utils.unregister_class(CyclesCurveSettings) bpy.utils.unregister_class(CyclesDeviceSettings) bpy.utils.unregister_class(CyclesPreferences) + bpy.utils.unregister_class(CyclesRenderLayerSettings) diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 2b50d272be8..49beebe5ab4 100644 --- a/intern/cycles/blender/addon/ui.py +++ 
b/intern/cycles/blender/addon/ui.py @@ -78,7 +78,7 @@ def use_cuda(context): def use_branched_path(context): cscene = context.scene.cycles - return (cscene.progressive == 'BRANCHED_PATH' and not use_opencl(context)) + return (cscene.progressive == 'BRANCHED_PATH') def use_sample_all_lights(context): @@ -156,7 +156,6 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel): row = layout.row() sub = row.row() - sub.active = get_device_type(context) != 'OPENCL' or use_cpu(context) sub.prop(cscene, "progressive", text="") row.prop(cscene, "use_square_samples") @@ -204,8 +203,7 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel): col.prop(cscene, "sample_all_lights_direct") col.prop(cscene, "sample_all_lights_indirect") - if not (use_opencl(context) and cscene.feature_set != 'EXPERIMENTAL'): - layout.row().prop(cscene, "sampling_pattern", text="Pattern") + layout.row().prop(cscene, "sampling_pattern", text="Pattern") for rl in scene.render.layers: if rl.samples > 0: @@ -478,11 +476,14 @@ class CyclesRender_PT_layer_passes(CyclesButtonsPanel, Panel): bl_options = {'DEFAULT_CLOSED'} def draw(self, context): + import _cycles + layout = self.layout scene = context.scene rd = scene.render rl = rd.layers.active + crl = rl.cycles split = layout.split() @@ -529,8 +530,18 @@ class CyclesRender_PT_layer_passes(CyclesButtonsPanel, Panel): col.prop(rl, "use_pass_emit", text="Emission") col.prop(rl, "use_pass_environment") - if hasattr(rd, "debug_pass_type"): - layout.prop(rd, "debug_pass_type") + if context.scene.cycles.feature_set == 'EXPERIMENTAL': + col.separator() + sub = col.column() + sub.active = crl.use_denoising + sub.prop(crl, "denoising_store_passes", text="Denoising") + + if _cycles.with_cycles_debug: + col = layout.column() + col.prop(crl, "pass_debug_bvh_traversed_nodes") + col.prop(crl, "pass_debug_bvh_traversed_instances") + col.prop(crl, "pass_debug_bvh_intersections") + col.prop(crl, "pass_debug_ray_bounces") class 
CyclesRender_PT_views(CyclesButtonsPanel, Panel): @@ -576,6 +587,71 @@ class CyclesRender_PT_views(CyclesButtonsPanel, Panel): row.prop(rv, "camera_suffix", text="") +class CyclesRender_PT_denoising(CyclesButtonsPanel, Panel): + bl_label = "Denoising" + bl_context = "render_layer" + bl_options = {'DEFAULT_CLOSED'} + + def draw_header(self, context): + rd = context.scene.render + rl = rd.layers.active + crl = rl.cycles + cscene = context.scene.cycles + layout = self.layout + + layout.active = not cscene.use_progressive_refine + layout.prop(crl, "use_denoising", text="") + + def draw(self, context): + layout = self.layout + + scene = context.scene + cscene = scene.cycles + rd = scene.render + rl = rd.layers.active + crl = rl.cycles + + layout.active = crl.use_denoising and not cscene.use_progressive_refine + + split = layout.split() + + col = split.column() + sub = col.column(align=True) + sub.prop(crl, "denoising_radius", text="Radius") + sub.prop(crl, "denoising_strength", slider=True, text="Strength") + + col = split.column() + sub = col.column(align=True) + sub.prop(crl, "denoising_feature_strength", slider=True, text="Feature Strength") + sub.prop(crl, "denoising_relative_pca") + + layout.separator() + + row = layout.row() + row.label(text="Diffuse:") + sub = row.row(align=True) + sub.prop(crl, "denoising_diffuse_direct", text="Direct", toggle=True) + sub.prop(crl, "denoising_diffuse_indirect", text="Indirect", toggle=True) + + row = layout.row() + row.label(text="Glossy:") + sub = row.row(align=True) + sub.prop(crl, "denoising_glossy_direct", text="Direct", toggle=True) + sub.prop(crl, "denoising_glossy_indirect", text="Indirect", toggle=True) + + row = layout.row() + row.label(text="Transmission:") + sub = row.row(align=True) + sub.prop(crl, "denoising_transmission_direct", text="Direct", toggle=True) + sub.prop(crl, "denoising_transmission_indirect", text="Indirect", toggle=True) + + row = layout.row() + row.label(text="Subsurface:") + sub = 
row.row(align=True) + sub.prop(crl, "denoising_subsurface_direct", text="Direct", toggle=True) + sub.prop(crl, "denoising_subsurface_indirect", text="Indirect", toggle=True) + + class Cycles_PT_post_processing(CyclesButtonsPanel, Panel): bl_label = "Post Processing" bl_options = {'DEFAULT_CLOSED'} @@ -1532,6 +1608,7 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel): col.prop(cscene, "debug_opencl_device_type", text="Device") col.prop(cscene, "debug_opencl_kernel_single_program", text="Single Program") col.prop(cscene, "debug_use_opencl_debug", text="Debug") + col.prop(cscene, "debug_opencl_mem_limit") class CyclesParticle_PT_CurveSettings(CyclesButtonsPanel, Panel): @@ -1634,7 +1711,7 @@ def draw_device(self, context): layout.prop(cscene, "feature_set") - split = layout.split(percentage=1/3) + split = layout.split(percentage=1 / 3) split.label("Device:") row = split.row() row.active = show_device_active(context) @@ -1729,6 +1806,7 @@ classes = ( CyclesRender_PT_layer_options, CyclesRender_PT_layer_passes, CyclesRender_PT_views, + CyclesRender_PT_denoising, Cycles_PT_post_processing, CyclesCamera_PT_dof, Cycles_PT_context_material, diff --git a/intern/cycles/blender/blender_curves.cpp b/intern/cycles/blender/blender_curves.cpp index 6fa038e8bf0..42b985305ea 100644 --- a/intern/cycles/blender/blender_curves.cpp +++ b/intern/cycles/blender/blender_curves.cpp @@ -411,7 +411,7 @@ static void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData, } } - mesh->resize_mesh(mesh->verts.size(), mesh->triangles.size()); + mesh->resize_mesh(mesh->verts.size(), mesh->num_triangles()); mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL); mesh->attributes.remove(ATTR_STD_FACE_NORMAL); mesh->add_face_normals(); @@ -546,7 +546,7 @@ static void ExportCurveTriangleGeometry(Mesh *mesh, } } - mesh->resize_mesh(mesh->verts.size(), mesh->triangles.size()); + mesh->resize_mesh(mesh->verts.size(), mesh->num_triangles()); mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL); 
mesh->attributes.remove(ATTR_STD_FACE_NORMAL); mesh->add_face_normals(); @@ -776,17 +776,17 @@ static void ExportCurveTriangleVcol(ParticleCurveData *CData, for(int curvekey = CData->curve_firstkey[curve]; curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1; curvekey++) { for(int section = 0; section < resol; section++) { - cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve])); + cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve])); vertexindex++; - cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve])); + cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve])); vertexindex++; - cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve])); + cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve])); vertexindex++; - cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve])); + cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve])); vertexindex++; - cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve])); + cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve])); vertexindex++; - cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve])); + cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve])); vertexindex++; } } @@ -1004,7 +1004,7 @@ void BlenderSync::sync_curves(Mesh *mesh, for(size_t curve = 0; curve < CData.curve_vcol.size(); curve++) if(!(CData.curve_keynum[curve] <= 1 || CData.curve_length[curve] == 0.0f)) - fdata[i++] = color_srgb_to_scene_linear(CData.curve_vcol[curve]); + fdata[i++] = color_srgb_to_scene_linear_v3(CData.curve_vcol[curve]); } } } diff --git 
a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp index 54571b1fea1..b4cca5f00f4 100644 --- a/intern/cycles/blender/blender_mesh.cpp +++ b/intern/cycles/blender/blender_mesh.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ - #include "render/mesh.h" #include "render/object.h" #include "render/scene.h" @@ -51,8 +50,7 @@ enum { * Two triangles has vertex indices in the original Blender-side face. * If face is already a quad tri_b will not be initialized. */ -inline void face_split_tri_indices(const int num_verts, - const int face_flag, +inline void face_split_tri_indices(const int face_flag, int tri_a[3], int tri_b[3]) { @@ -60,21 +58,19 @@ inline void face_split_tri_indices(const int num_verts, tri_a[0] = 0; tri_a[1] = 1; tri_a[2] = 3; - if(num_verts == 4) { - tri_b[0] = 2; - tri_b[1] = 3; - tri_b[2] = 1; - } + + tri_b[0] = 2; + tri_b[1] = 3; + tri_b[2] = 1; } else /*if(face_flag & FACE_FLAG_DIVIDE_13)*/ { tri_a[0] = 0; tri_a[1] = 1; tri_a[2] = 2; - if(num_verts == 4) { - tri_b[0] = 0; - tri_b[1] = 2; - tri_b[2] = 3; - } + + tri_b[0] = 0; + tri_b[1] = 2; + tri_b[2] = 3; } } @@ -251,7 +247,7 @@ static void mikk_compute_tangents(BL::Mesh& b_mesh, for(int i = 0; i < nverts.size(); i++) { int tri_a[3], tri_b[3]; - face_split_tri_indices(nverts[i], face_flags[i], tri_a, tri_b); + face_split_tri_indices(face_flags[i], tri_a, tri_b); tangent[0] = float4_to_float3(userdata.tangent[i*4 + tri_a[0]]); tangent[1] = float4_to_float3(userdata.tangent[i*4 + tri_a[1]]); @@ -293,7 +289,7 @@ static void create_mesh_volume_attribute(BL::Object& b_ob, if(!b_domain) return; - + Attribute *attr = mesh->attributes.add(std); VoxelAttribute *volume_data = attr->data_voxel(); bool is_float, is_linear; @@ -356,7 +352,7 @@ static void attr_create_vertex_color(Scene *scene, int n = p->loop_total(); for(int i = 0; i < n; i++) { float3 color = get_float3(l->data[p->loop_start() + i].color()); - *(cdata++) = 
color_float_to_byte(color_srgb_to_scene_linear(color)); + *(cdata++) = color_float_to_byte(color_srgb_to_scene_linear_v3(color)); } } } @@ -377,14 +373,14 @@ static void attr_create_vertex_color(Scene *scene, for(l->data.begin(c); c != l->data.end(); ++c, ++i) { int tri_a[3], tri_b[3]; - face_split_tri_indices(nverts[i], face_flags[i], tri_a, tri_b); + face_split_tri_indices(face_flags[i], tri_a, tri_b); uchar4 colors[4]; - colors[0] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color1()))); - colors[1] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color2()))); - colors[2] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color3()))); + colors[0] = color_float_to_byte(color_srgb_to_scene_linear_v3(get_float3(c->color1()))); + colors[1] = color_float_to_byte(color_srgb_to_scene_linear_v3(get_float3(c->color2()))); + colors[2] = color_float_to_byte(color_srgb_to_scene_linear_v3(get_float3(c->color3()))); if(nverts[i] == 4) { - colors[3] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color4()))); + colors[3] = color_float_to_byte(color_srgb_to_scene_linear_v3(get_float3(c->color4()))); } cdata[0] = colors[tri_a[0]]; @@ -470,7 +466,7 @@ static void attr_create_uv_map(Scene *scene, for(l->data.begin(t); t != l->data.end(); ++t, ++i) { int tri_a[3], tri_b[3]; - face_split_tri_indices(nverts[i], face_flags[i], tri_a, tri_b); + face_split_tri_indices(face_flags[i], tri_a, tri_b); float3 uvs[4]; uvs[0] = get_float3(t->uv1()); @@ -982,7 +978,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob, else used_shaders.push_back(scene->default_surface); } - + /* test if we need to sync */ int requested_geometry_flags = Mesh::GEOMETRY_NONE; if(render_layer.use_surfaces) { @@ -1017,12 +1013,12 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob, /* ensure we only sync instanced meshes once */ if(mesh_synced.find(mesh) != mesh_synced.end()) return mesh; - + mesh_synced.insert(mesh); /* create derived mesh */ array<int> 
oldtriangle = mesh->triangles; - + /* compares curve_keys rather than strands in order to handle quick hair * adjustments in dynamic BVH - other methods could probably do this better*/ array<float3> oldcurve_keys = mesh->curve_keys; @@ -1111,7 +1107,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob, if(memcmp(&oldcurve_radius[0], &mesh->curve_radius[0], sizeof(float)*oldcurve_radius.size()) != 0) rebuild = true; } - + mesh->tag_update(scene, rebuild); return mesh; @@ -1140,7 +1136,7 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob, if(scene->need_motion() == Scene::MOTION_BLUR) { if(!mesh->use_motion_blur) return; - + /* see if this mesh needs motion data at this time */ vector<float> object_times = object->motion_times(); bool found = false; @@ -1172,7 +1168,7 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob, if(!numverts && !numkeys) return; - + /* skip objects without deforming modifiers. this is not totally reliable, * would need a more extensive check to see which objects are animated */ BL::Mesh b_mesh(PointerRNA_NULL); diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp index d05699236cc..a930c439370 100644 --- a/intern/cycles/blender/blender_object.cpp +++ b/intern/cycles/blender/blender_object.cpp @@ -379,27 +379,16 @@ Object *BlenderSync::sync_object(BL::Object& b_parent, } } - /* random number */ - object->random_id = hash_string(object->name.c_str()); - - if(persistent_id) { - for(int i = 0; i < OBJECT_PERSISTENT_ID_SIZE; i++) - object->random_id = hash_int_2d(object->random_id, persistent_id[i]); - } - else - object->random_id = hash_int_2d(object->random_id, 0); - - if(b_parent.ptr.data != b_ob.ptr.data) - object->random_id ^= hash_int(hash_string(b_parent.name().c_str())); - - /* dupli texture coordinates */ + /* dupli texture coordinates and random_id */ if(b_dupli_ob) { object->dupli_generated = 0.5f*get_float3(b_dupli_ob.orco()) - make_float3(0.5f, 0.5f, 0.5f); object->dupli_uv = 
get_float2(b_dupli_ob.uv()); + object->random_id = b_dupli_ob.random_id(); } else { object->dupli_generated = make_float3(0.0f, 0.0f, 0.0f); object->dupli_uv = make_float2(0.0f, 0.0f); + object->random_id = hash_int_2d(hash_string(object->name.c_str()), 0); } object->tag_update(scene); @@ -489,7 +478,7 @@ static bool object_render_hide_duplis(BL::Object& b_ob) /* Object Loop */ -void BlenderSync::sync_objects(BL::SpaceView3D& b_v3d, float motion_time) +void BlenderSync::sync_objects(float motion_time) { /* layer data */ uint scene_layer = render_layer.scene_layer; @@ -517,7 +506,7 @@ void BlenderSync::sync_objects(BL::SpaceView3D& b_v3d, float motion_time) * 1 : DAG_EVAL_PREVIEW * 2 : DAG_EVAL_RENDER */ - int dupli_settings = preview ? 1 : 2; + int dupli_settings = (render_layer.use_viewport_visibility) ? 1 : 2; bool cancel = false; bool use_portal = false; @@ -552,7 +541,7 @@ void BlenderSync::sync_objects(BL::SpaceView3D& b_v3d, float motion_time) for(b_ob.dupli_list.begin(b_dup); b_dup != b_ob.dupli_list.end(); ++b_dup) { Transform tfm = get_transform(b_dup->matrix()); BL::Object b_dup_ob = b_dup->object(); - bool dup_hide = (b_v3d)? b_dup_ob.hide(): b_dup_ob.hide_render(); + bool dup_hide = (render_layer.use_viewport_visibility)? 
b_dup_ob.hide(): b_dup_ob.hide_render(); bool in_dupli_group = (b_dup->type() == BL::DupliObject::type_GROUP); bool hide_tris; @@ -628,7 +617,6 @@ void BlenderSync::sync_objects(BL::SpaceView3D& b_v3d, float motion_time) } void BlenderSync::sync_motion(BL::RenderSettings& b_render, - BL::SpaceView3D& b_v3d, BL::Object& b_override, int width, int height, void **python_thread_state) @@ -665,7 +653,7 @@ void BlenderSync::sync_motion(BL::RenderSettings& b_render, b_engine.frame_set(frame, subframe); python_thread_state_save(python_thread_state); sync_camera_motion(b_render, b_cam, width, height, 0.0f); - sync_objects(b_v3d, 0.0f); + sync_objects(0.0f); } /* always sample these times for camera motion */ @@ -699,7 +687,7 @@ void BlenderSync::sync_motion(BL::RenderSettings& b_render, } /* sync object */ - sync_objects(b_v3d, relative_time); + sync_objects(relative_time); } /* we need to set the python thread state again because this diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp index d509e9de981..54973fd1b7f 100644 --- a/intern/cycles/blender/blender_python.cpp +++ b/intern/cycles/blender/blender_python.cpp @@ -106,6 +106,7 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene) } /* Synchronize other OpenCL flags. 
*/ flags.opencl.debug = get_boolean(cscene, "debug_use_opencl_debug"); + flags.opencl.mem_limit = ((size_t)get_int(cscene, "debug_opencl_mem_limit"))*1024*1024; flags.opencl.single_program = get_boolean(cscene, "debug_opencl_kernel_single_program"); return flags.opencl.device_type != opencl_device_type || flags.opencl.kernel_type != opencl_kernel_type; @@ -811,6 +812,14 @@ void *CCL_python_module_init() PyModule_AddStringConstant(mod, "osl_version_string", "unknown"); #endif +#ifdef WITH_CYCLES_DEBUG + PyModule_AddObject(mod, "with_cycles_debug", Py_True); + Py_INCREF(Py_True); +#else + PyModule_AddObject(mod, "with_cycles_debug", Py_False); + Py_INCREF(Py_False); +#endif + #ifdef WITH_NETWORK PyModule_AddObject(mod, "with_network", Py_True); Py_INCREF(Py_True); diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp index 26f9bccd95d..12de3da063f 100644 --- a/intern/cycles/blender/blender_session.cpp +++ b/intern/cycles/blender/blender_session.cpp @@ -129,9 +129,9 @@ void BlenderSession::create_session() scene = new Scene(scene_params, session_params.device); /* setup callbacks for builtin image support */ - scene->image_manager->builtin_image_info_cb = function_bind(&BlenderSession::builtin_image_info, this, _1, _2, _3, _4, _5, _6, _7); - scene->image_manager->builtin_image_pixels_cb = function_bind(&BlenderSession::builtin_image_pixels, this, _1, _2, _3, _4); - scene->image_manager->builtin_image_float_pixels_cb = function_bind(&BlenderSession::builtin_image_float_pixels, this, _1, _2, _3, _4); + scene->image_manager->builtin_image_info_cb = function_bind(&BlenderSession::builtin_image_info, this, _1, _2, _3, _4, _5, _6, _7, _8); + scene->image_manager->builtin_image_pixels_cb = function_bind(&BlenderSession::builtin_image_pixels, this, _1, _2, _3, _4, _5); + scene->image_manager->builtin_image_float_pixels_cb = function_bind(&BlenderSession::builtin_image_float_pixels, this, _1, _2, _3, _4, _5); /* create session */ 
session = new Session(session_params); @@ -243,90 +243,6 @@ void BlenderSession::free_session() delete session; } -static PassType get_pass_type(BL::RenderPass& b_pass) -{ - switch(b_pass.type()) { - case BL::RenderPass::type_COMBINED: - return PASS_COMBINED; - - case BL::RenderPass::type_Z: - return PASS_DEPTH; - case BL::RenderPass::type_MIST: - return PASS_MIST; - case BL::RenderPass::type_NORMAL: - return PASS_NORMAL; - case BL::RenderPass::type_OBJECT_INDEX: - return PASS_OBJECT_ID; - case BL::RenderPass::type_UV: - return PASS_UV; - case BL::RenderPass::type_VECTOR: - return PASS_MOTION; - case BL::RenderPass::type_MATERIAL_INDEX: - return PASS_MATERIAL_ID; - - case BL::RenderPass::type_DIFFUSE_DIRECT: - return PASS_DIFFUSE_DIRECT; - case BL::RenderPass::type_GLOSSY_DIRECT: - return PASS_GLOSSY_DIRECT; - case BL::RenderPass::type_TRANSMISSION_DIRECT: - return PASS_TRANSMISSION_DIRECT; - case BL::RenderPass::type_SUBSURFACE_DIRECT: - return PASS_SUBSURFACE_DIRECT; - - case BL::RenderPass::type_DIFFUSE_INDIRECT: - return PASS_DIFFUSE_INDIRECT; - case BL::RenderPass::type_GLOSSY_INDIRECT: - return PASS_GLOSSY_INDIRECT; - case BL::RenderPass::type_TRANSMISSION_INDIRECT: - return PASS_TRANSMISSION_INDIRECT; - case BL::RenderPass::type_SUBSURFACE_INDIRECT: - return PASS_SUBSURFACE_INDIRECT; - - case BL::RenderPass::type_DIFFUSE_COLOR: - return PASS_DIFFUSE_COLOR; - case BL::RenderPass::type_GLOSSY_COLOR: - return PASS_GLOSSY_COLOR; - case BL::RenderPass::type_TRANSMISSION_COLOR: - return PASS_TRANSMISSION_COLOR; - case BL::RenderPass::type_SUBSURFACE_COLOR: - return PASS_SUBSURFACE_COLOR; - - case BL::RenderPass::type_EMIT: - return PASS_EMISSION; - case BL::RenderPass::type_ENVIRONMENT: - return PASS_BACKGROUND; - case BL::RenderPass::type_AO: - return PASS_AO; - case BL::RenderPass::type_SHADOW: - return PASS_SHADOW; - - case BL::RenderPass::type_DIFFUSE: - case BL::RenderPass::type_COLOR: - case BL::RenderPass::type_REFRACTION: - case 
BL::RenderPass::type_SPECULAR: - case BL::RenderPass::type_REFLECTION: - return PASS_NONE; -#ifdef WITH_CYCLES_DEBUG - case BL::RenderPass::type_DEBUG: - { - switch(b_pass.debug_type()) { - case BL::RenderPass::debug_type_BVH_TRAVERSED_NODES: - return PASS_BVH_TRAVERSED_NODES; - case BL::RenderPass::debug_type_BVH_TRAVERSED_INSTANCES: - return PASS_BVH_TRAVERSED_INSTANCES; - case BL::RenderPass::debug_type_BVH_INTERSECTIONS: - return PASS_BVH_INTERSECTIONS; - case BL::RenderPass::debug_type_RAY_BOUNCES: - return PASS_RAY_BOUNCES; - } - break; - } -#endif - } - - return PASS_NONE; -} - static ShaderEvalType get_shader_type(const string& pass_type) { const char *shader_type = pass_type.c_str(); @@ -383,12 +299,13 @@ static BL::RenderResult begin_render_result(BL::RenderEngine& b_engine, static void end_render_result(BL::RenderEngine& b_engine, BL::RenderResult& b_rr, bool cancel, + bool highlight, bool do_merge_results) { - b_engine.end_result(b_rr, (int)cancel, (int)do_merge_results); + b_engine.end_result(b_rr, (int)cancel, (int) highlight, (int)do_merge_results); } -void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_update_only) +void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_update_only, bool highlight) { BufferParams& params = rtile.buffers->params; int x = params.full_x - session->tile_manager.params.full_x; @@ -424,37 +341,37 @@ void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_upda update_render_result(b_rr, b_rlay, rtile); } - end_render_result(b_engine, b_rr, true, true); + end_render_result(b_engine, b_rr, true, highlight, true); } else { /* write result */ write_render_result(b_rr, b_rlay, rtile); - end_render_result(b_engine, b_rr, false, true); + end_render_result(b_engine, b_rr, false, false, true); } } void BlenderSession::write_render_tile(RenderTile& rtile) { - do_write_update_render_tile(rtile, false); + do_write_update_render_tile(rtile, false, false); } -void 
BlenderSession::update_render_tile(RenderTile& rtile) +void BlenderSession::update_render_tile(RenderTile& rtile, bool highlight) { /* use final write for preview renders, otherwise render result wouldn't be * be updated in blender side * would need to be investigated a bit further, but for now shall be fine */ if(!b_engine.is_preview()) - do_write_update_render_tile(rtile, true); + do_write_update_render_tile(rtile, true, highlight); else - do_write_update_render_tile(rtile, false); + do_write_update_render_tile(rtile, false, false); } void BlenderSession::render() { /* set callback to write out render results */ session->write_render_tile_cb = function_bind(&BlenderSession::write_render_tile, this, _1); - session->update_render_tile_cb = function_bind(&BlenderSession::update_render_tile, this, _1); + session->update_render_tile_cb = function_bind(&BlenderSession::update_render_tile, this, _1, _2); /* get buffer parameters */ SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background); @@ -475,33 +392,38 @@ void BlenderSession::render() /* layer will be missing if it was disabled in the UI */ if(b_single_rlay == b_rr.layers.end()) { - end_render_result(b_engine, b_rr, true, false); + end_render_result(b_engine, b_rr, true, true, false); continue; } BL::RenderLayer b_rlay = *b_single_rlay; /* add passes */ - array<Pass> passes; - Pass::add(PASS_COMBINED, passes); - - if(session_params.device.advanced_shading) { - - /* loop over passes */ - BL::RenderLayer::passes_iterator b_pass_iter; - - for(b_rlay.passes.begin(b_pass_iter); b_pass_iter != b_rlay.passes.end(); ++b_pass_iter) { - BL::RenderPass b_pass(*b_pass_iter); - PassType pass_type = get_pass_type(b_pass); + array<Pass> passes = sync->sync_render_passes(b_rlay, *b_layer_iter, session_params); + buffer_params.passes = passes; - if(pass_type == PASS_MOTION && scene->integrator->motion_blur) - continue; - if(pass_type != PASS_NONE) - Pass::add(pass_type, passes); - } 
- } + PointerRNA crl = RNA_pointer_get(&b_layer_iter->ptr, "cycles"); + bool use_denoising = !session_params.progressive_refine && get_boolean(crl, "use_denoising"); + buffer_params.denoising_data_pass = use_denoising; + session->tile_manager.schedule_denoising = use_denoising; + session->params.use_denoising = use_denoising; + scene->film->denoising_data_pass = buffer_params.denoising_data_pass; + scene->film->denoising_flags = 0; + if(!get_boolean(crl, "denoising_diffuse_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_DIFFUSE_DIR; + if(!get_boolean(crl, "denoising_diffuse_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_DIFFUSE_IND; + if(!get_boolean(crl, "denoising_glossy_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_GLOSSY_DIR; + if(!get_boolean(crl, "denoising_glossy_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_GLOSSY_IND; + if(!get_boolean(crl, "denoising_transmission_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_TRANSMISSION_DIR; + if(!get_boolean(crl, "denoising_transmission_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_TRANSMISSION_IND; + if(!get_boolean(crl, "denoising_subsurface_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_SUBSURFACE_DIR; + if(!get_boolean(crl, "denoising_subsurface_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_SUBSURFACE_IND; + scene->film->denoising_clean_pass = (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES); + buffer_params.denoising_clean_pass = scene->film->denoising_clean_pass; + session->params.denoising_radius = get_int(crl, "denoising_radius"); + session->params.denoising_strength = get_float(crl, "denoising_strength"); + session->params.denoising_feature_strength = get_float(crl, "denoising_feature_strength"); + session->params.denoising_relative_pca = get_boolean(crl, "denoising_relative_pca"); - buffer_params.passes = passes; scene->film->pass_alpha_threshold = b_layer_iter->pass_alpha_threshold(); 
scene->film->tag_passes_update(scene, passes); scene->film->tag_update(scene); @@ -555,7 +477,7 @@ void BlenderSession::render() } /* free result without merging */ - end_render_result(b_engine, b_rr, true, false); + end_render_result(b_engine, b_rr, true, true, false); if(session->progress.get_cancel()) break; @@ -636,8 +558,6 @@ void BlenderSession::bake(BL::Object& b_object, float result[]) { ShaderEvalType shader_type = get_shader_type(pass_type); - size_t object_index = OBJECT_NONE; - int tri_offset = 0; /* Set baking flag in advance, so kernel loading can check if we need * any baking capabilities. @@ -647,9 +567,6 @@ void BlenderSession::bake(BL::Object& b_object, /* ensure kernels are loaded before we do any scene updates */ session->load_kernels(); - if(session->progress.get_cancel()) - return; - if(shader_type == SHADER_EVAL_UV) { /* force UV to be available */ Pass::add(PASS_UV, scene->film->passes); @@ -667,50 +584,61 @@ void BlenderSession::bake(BL::Object& b_object, scene->film->tag_update(scene); scene->integrator->tag_update(scene); - /* update scene */ - BL::Object b_camera_override(b_engine.camera_override()); - sync->sync_camera(b_render, b_camera_override, width, height, ""); - sync->sync_data(b_render, - b_v3d, - b_camera_override, - width, height, - &python_thread_state, - b_rlay_name.c_str()); + if(!session->progress.get_cancel()) { + /* update scene */ + BL::Object b_camera_override(b_engine.camera_override()); + sync->sync_camera(b_render, b_camera_override, width, height, ""); + sync->sync_data(b_render, + b_v3d, + b_camera_override, + width, height, + &python_thread_state, + b_rlay_name.c_str()); + } - /* get buffer parameters */ - SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background); - BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_v3d, b_rv3d, scene->camera, width, height); + BakeData *bake_data = NULL; + + if(!session->progress.get_cancel()) { + /* get buffer 
parameters */ + SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background); + BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_v3d, b_rv3d, scene->camera, width, height); - scene->bake_manager->set_shader_limit((size_t)b_engine.tile_x(), (size_t)b_engine.tile_y()); + scene->bake_manager->set_shader_limit((size_t)b_engine.tile_x(), (size_t)b_engine.tile_y()); - /* set number of samples */ - session->tile_manager.set_samples(session_params.samples); - session->reset(buffer_params, session_params.samples); - session->update_scene(); + /* set number of samples */ + session->tile_manager.set_samples(session_params.samples); + session->reset(buffer_params, session_params.samples); + session->update_scene(); - /* find object index. todo: is arbitrary - copied from mesh_displace.cpp */ - for(size_t i = 0; i < scene->objects.size(); i++) { - if(strcmp(scene->objects[i]->name.c_str(), b_object.name().c_str()) == 0) { - object_index = i; - tri_offset = scene->objects[i]->mesh->tri_offset; - break; - } - } + /* find object index. 
todo: is arbitrary - copied from mesh_displace.cpp */ + size_t object_index = OBJECT_NONE; + int tri_offset = 0; - int object = object_index; + for(size_t i = 0; i < scene->objects.size(); i++) { + if(strcmp(scene->objects[i]->name.c_str(), b_object.name().c_str()) == 0) { + object_index = i; + tri_offset = scene->objects[i]->mesh->tri_offset; + break; + } + } - BakeData *bake_data = scene->bake_manager->init(object, tri_offset, num_pixels); + int object = object_index; - populate_bake_data(bake_data, object_id, pixel_array, num_pixels); + bake_data = scene->bake_manager->init(object, tri_offset, num_pixels); + populate_bake_data(bake_data, object_id, pixel_array, num_pixels); - /* set number of samples */ - session->tile_manager.set_samples(session_params.samples); - session->reset(buffer_params, session_params.samples); - session->update_scene(); + /* set number of samples */ + session->tile_manager.set_samples(session_params.samples); + session->reset(buffer_params, session_params.samples); + session->update_scene(); - session->progress.set_update_callback(function_bind(&BlenderSession::update_bake_progress, this)); + session->progress.set_update_callback(function_bind(&BlenderSession::update_bake_progress, this)); + } - scene->bake_manager->bake(scene->device, &scene->dscene, scene, session->progress, shader_type, bake_pass_filter, bake_data, result); + /* Perform bake. Check cancel to avoid crash with incomplete scene data. 
*/ + if(!session->progress.get_cancel()) { + scene->bake_manager->bake(scene->device, &scene->dscene, scene, session->progress, shader_type, bake_pass_filter, bake_data, result); + } /* free all memory used (host and device), so we wouldn't leave render * engine with extra memory allocated @@ -753,19 +681,31 @@ void BlenderSession::do_write_update_render_result(BL::RenderResult& b_rr, BL::RenderPass b_pass(*b_iter); /* find matching pass type */ - PassType pass_type = get_pass_type(b_pass); + PassType pass_type = BlenderSync::get_pass_type(b_pass); int components = b_pass.channels(); - /* copy pixels */ - if(!buffers->get_pass_rect(pass_type, exposure, sample, components, &pixels[0])) + bool read = false; + if(pass_type != PASS_NONE) { + /* copy pixels */ + read = buffers->get_pass_rect(pass_type, exposure, sample, components, &pixels[0]); + } + else { + int denoising_offset = BlenderSync::get_denoising_pass(b_pass); + if(denoising_offset >= 0) { + read = buffers->get_denoising_pass_rect(denoising_offset, exposure, sample, components, &pixels[0]); + } + } + + if(!read) { memset(&pixels[0], 0, pixels.size()*sizeof(float)); + } b_pass.rect(&pixels[0]); } } else { /* copy combined pass */ - BL::RenderPass b_combined_pass(b_rlay.passes.find_by_type(BL::RenderPass::type_COMBINED, b_rview_name.c_str())); + BL::RenderPass b_combined_pass(b_rlay.passes.find_by_name("Combined", b_rview_name.c_str())); if(buffers->get_pass_rect(PASS_COMBINED, exposure, sample, 4, &pixels[0])) b_combined_pass.rect(&pixels[0]); } @@ -1073,7 +1013,8 @@ void BlenderSession::builtin_image_info(const string &builtin_name, int &width, int &height, int &depth, - int &channels) + int &channels, + bool& free_cache) { /* empty image */ is_float = false; @@ -1081,6 +1022,7 @@ void BlenderSession::builtin_image_info(const string &builtin_name, height = 1; depth = 0; channels = 0; + free_cache = false; if(!builtin_data) return; @@ -1094,6 +1036,7 @@ void BlenderSession::builtin_image_info(const string 
&builtin_name, /* image data */ BL::Image b_image(b_id); + free_cache = !b_image.has_data(); is_float = b_image.is_float(); width = b_image.size()[0]; height = b_image.size()[1]; @@ -1154,7 +1097,8 @@ void BlenderSession::builtin_image_info(const string &builtin_name, bool BlenderSession::builtin_image_pixels(const string &builtin_name, void *builtin_data, unsigned char *pixels, - const size_t pixels_size) + const size_t pixels_size, + const bool free_cache) { if(!builtin_data) { return false; @@ -1175,7 +1119,6 @@ bool BlenderSession::builtin_image_pixels(const string &builtin_name, if(image_pixels && num_pixels * channels == pixels_size) { memcpy(pixels, image_pixels, pixels_size * sizeof(unsigned char)); - MEM_freeN(image_pixels); } else { if(channels == 1) { @@ -1194,6 +1137,16 @@ bool BlenderSession::builtin_image_pixels(const string &builtin_name, } } } + + if(image_pixels) { + MEM_freeN(image_pixels); + } + + /* Free image buffers to save memory during render. */ + if(free_cache) { + b_image.buffers_free(); + } + /* Premultiply, byte images are always straight for Blender. */ unsigned char *cp = pixels; for(size_t i = 0; i < num_pixels; i++, cp += channels) { @@ -1207,7 +1160,8 @@ bool BlenderSession::builtin_image_pixels(const string &builtin_name, bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, void *builtin_data, float *pixels, - const size_t pixels_size) + const size_t pixels_size, + const bool free_cache) { if(!builtin_data) { return false; @@ -1232,7 +1186,6 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, if(image_pixels && num_pixels * channels == pixels_size) { memcpy(pixels, image_pixels, pixels_size * sizeof(float)); - MEM_freeN(image_pixels); } else { if(channels == 1) { @@ -1252,6 +1205,15 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, } } + if(image_pixels) { + MEM_freeN(image_pixels); + } + + /* Free image buffers to save memory during render. 
*/ + if(free_cache) { + b_image.buffers_free(); + } + return true; } else if(b_id.is_a(&RNA_Object)) { diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h index 22b21a18f2e..cbd2303d282 100644 --- a/intern/cycles/blender/blender_session.h +++ b/intern/cycles/blender/blender_session.h @@ -79,7 +79,7 @@ public: void update_render_result(BL::RenderResult& b_rr, BL::RenderLayer& b_rlay, RenderTile& rtile); - void update_render_tile(RenderTile& rtile); + void update_render_tile(RenderTile& rtile, bool highlight); /* interactive updates */ void synchronize(); @@ -147,7 +147,7 @@ protected: BL::RenderLayer& b_rlay, RenderTile& rtile, bool do_update_only); - void do_write_update_render_tile(RenderTile& rtile, bool do_update_only); + void do_write_update_render_tile(RenderTile& rtile, bool do_update_only, bool highlight); int builtin_image_frame(const string &builtin_name); void builtin_image_info(const string &builtin_name, @@ -156,15 +156,18 @@ protected: int &width, int &height, int &depth, - int &channels); + int &channels, + bool &free_cache); bool builtin_image_pixels(const string &builtin_name, void *builtin_data, unsigned char *pixels, - const size_t pixels_size); + const size_t pixels_size, + const bool free_cache); bool builtin_image_float_pixels(const string &builtin_name, void *builtin_data, float *pixels, - const size_t pixels_size); + const size_t pixels_size, + const bool free_cache); /* Update tile manager to reflect resumable render settings. 
*/ void update_resumable_tile_manager(int num_samples); diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp index 3f04f11aab4..bdbab1006c0 100644 --- a/intern/cycles/blender/blender_shader.cpp +++ b/intern/cycles/blender/blender_shader.cpp @@ -521,6 +521,19 @@ static ShaderNode *add_node(Scene *scene, } node = hair; } + else if(b_node.is_a(&RNA_ShaderNodeBsdfPrincipled)) { + BL::ShaderNodeBsdfPrincipled b_principled_node(b_node); + PrincipledBsdfNode *principled = new PrincipledBsdfNode(); + switch (b_principled_node.distribution()) { + case BL::ShaderNodeBsdfPrincipled::distribution_GGX: + principled->distribution = CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID; + break; + case BL::ShaderNodeBsdfPrincipled::distribution_MULTI_GGX: + principled->distribution = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID; + break; + } + node = principled; + } else if(b_node.is_a(&RNA_ShaderNodeBsdfTranslucent)) { node = new TranslucentBsdfNode(); } diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index 3b071bf0e7d..3a00384458a 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -210,10 +210,9 @@ void BlenderSync::sync_data(BL::RenderSettings& b_render, scene->need_motion() == Scene::MOTION_NONE || scene->camera->motion_position == Camera::MOTION_POSITION_CENTER) { - sync_objects(b_v3d); + sync_objects(); } sync_motion(b_render, - b_v3d, b_override, width, height, python_thread_state); @@ -330,6 +329,9 @@ void BlenderSync::sync_integrator() integrator->ao_bounces = get_int(cscene, "ao_bounces_render"); } } + else { + integrator->ao_bounces = 0; + } if(integrator->modified(previntegrator)) integrator->tag_update(scene); @@ -480,6 +482,137 @@ void BlenderSync::sync_images() } } +/* Passes */ +PassType BlenderSync::get_pass_type(BL::RenderPass& b_pass) +{ + string name = b_pass.name(); +#define MAP_PASS(passname, passtype) if(name == passname) return passtype; + 
/* NOTE: Keep in sync with defined names from DNA_scene_types.h */ + MAP_PASS("Combined", PASS_COMBINED); + MAP_PASS("Depth", PASS_DEPTH); + MAP_PASS("Mist", PASS_MIST); + MAP_PASS("Normal", PASS_NORMAL); + MAP_PASS("IndexOB", PASS_OBJECT_ID); + MAP_PASS("UV", PASS_UV); + MAP_PASS("Vector", PASS_MOTION); + MAP_PASS("IndexMA", PASS_MATERIAL_ID); + + MAP_PASS("DiffDir", PASS_DIFFUSE_DIRECT); + MAP_PASS("GlossDir", PASS_GLOSSY_DIRECT); + MAP_PASS("TransDir", PASS_TRANSMISSION_DIRECT); + MAP_PASS("SubsurfaceDir", PASS_SUBSURFACE_DIRECT); + + MAP_PASS("DiffInd", PASS_DIFFUSE_INDIRECT); + MAP_PASS("GlossInd", PASS_GLOSSY_INDIRECT); + MAP_PASS("TransInd", PASS_TRANSMISSION_INDIRECT); + MAP_PASS("SubsurfaceInd", PASS_SUBSURFACE_INDIRECT); + + MAP_PASS("DiffCol", PASS_DIFFUSE_COLOR); + MAP_PASS("GlossCol", PASS_GLOSSY_COLOR); + MAP_PASS("TransCol", PASS_TRANSMISSION_COLOR); + MAP_PASS("SubsurfaceCol", PASS_SUBSURFACE_COLOR); + + MAP_PASS("Emit", PASS_EMISSION); + MAP_PASS("Env", PASS_BACKGROUND); + MAP_PASS("AO", PASS_AO); + MAP_PASS("Shadow", PASS_SHADOW); + +#ifdef __KERNEL_DEBUG__ + MAP_PASS("Debug BVH Traversed Nodes", PASS_BVH_TRAVERSED_NODES); + MAP_PASS("Debug BVH Traversed Instances", PASS_BVH_TRAVERSED_INSTANCES); + MAP_PASS("Debug BVH Intersections", PASS_BVH_INTERSECTIONS); + MAP_PASS("Debug Ray Bounces", PASS_RAY_BOUNCES); +#endif +#undef MAP_PASS + + return PASS_NONE; +} + +int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass) +{ + string name = b_pass.name(); + if(name.substr(0, 10) != "Denoising ") { + return -1; + } + name = name.substr(10); + +#define MAP_PASS(passname, offset) if(name == passname) return offset; + MAP_PASS("Normal", DENOISING_PASS_NORMAL); + MAP_PASS("Normal Variance", DENOISING_PASS_NORMAL_VAR); + MAP_PASS("Albedo", DENOISING_PASS_ALBEDO); + MAP_PASS("Albedo Variance", DENOISING_PASS_ALBEDO_VAR); + MAP_PASS("Depth", DENOISING_PASS_DEPTH); + MAP_PASS("Depth Variance", DENOISING_PASS_DEPTH_VAR); + MAP_PASS("Shadow A", 
DENOISING_PASS_SHADOW_A); + MAP_PASS("Shadow B", DENOISING_PASS_SHADOW_B); + MAP_PASS("Image", DENOISING_PASS_COLOR); + MAP_PASS("Image Variance", DENOISING_PASS_COLOR_VAR); +#undef MAP_PASS + + return -1; +} + +array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay, + BL::SceneRenderLayer& b_srlay, + const SessionParams &session_params) +{ + array<Pass> passes; + Pass::add(PASS_COMBINED, passes); + + if(!session_params.device.advanced_shading) { + return passes; + } + + /* loop over passes */ + BL::RenderLayer::passes_iterator b_pass_iter; + + for(b_rlay.passes.begin(b_pass_iter); b_pass_iter != b_rlay.passes.end(); ++b_pass_iter) { + BL::RenderPass b_pass(*b_pass_iter); + PassType pass_type = get_pass_type(b_pass); + + if(pass_type == PASS_MOTION && scene->integrator->motion_blur) + continue; + if(pass_type != PASS_NONE) + Pass::add(pass_type, passes); + } + + PointerRNA crp = RNA_pointer_get(&b_srlay.ptr, "cycles"); + if(get_boolean(crp, "denoising_store_passes") && + get_boolean(crp, "use_denoising") && + !session_params.progressive_refine) { + b_engine.add_pass("Denoising Normal", 3, "XYZ", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Normal Variance", 3, "XYZ", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Albedo", 3, "RGB", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Albedo Variance", 3, "RGB", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Depth", 1, "Z", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Depth Variance", 1, "Z", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Shadow A", 3, "XYV", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Shadow B", 3, "XYV", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Image", 3, "RGB", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Image Variance", 3, "RGB", b_srlay.name().c_str()); + } +#ifdef __KERNEL_DEBUG__ + if(get_boolean(crp, "pass_debug_bvh_traversed_nodes")) { + b_engine.add_pass("Debug BVH Traversed Nodes", 1, 
"X", b_srlay.name().c_str()); + Pass::add(PASS_BVH_TRAVERSED_NODES, passes); + } + if(get_boolean(crp, "pass_debug_bvh_traversed_instances")) { + b_engine.add_pass("Debug BVH Traversed Instances", 1, "X", b_srlay.name().c_str()); + Pass::add(PASS_BVH_TRAVERSED_INSTANCES, passes); + } + if(get_boolean(crp, "pass_debug_bvh_intersections")) { + b_engine.add_pass("Debug BVH Intersections", 1, "X", b_srlay.name().c_str()); + Pass::add(PASS_BVH_INTERSECTIONS, passes); + } + if(get_boolean(crp, "pass_debug_ray_bounces")) { + b_engine.add_pass("Debug Ray Bounces", 1, "X", b_srlay.name().c_str()); + Pass::add(PASS_RAY_BOUNCES, passes); + } +#endif + + return passes; +} + /* Scene Parameters */ SceneParams BlenderSync::get_scene_params(BL::Scene& b_scene, diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h index 36bedc505af..4ec46424b5a 100644 --- a/intern/cycles/blender/blender_sync.h +++ b/intern/cycles/blender/blender_sync.h @@ -67,6 +67,9 @@ public: void **python_thread_state, const char *layer = 0); void sync_render_layers(BL::SpaceView3D& b_v3d, const char *layer); + array<Pass> sync_render_passes(BL::RenderLayer& b_rlay, + BL::SceneRenderLayer& b_srlay, + const SessionParams &session_params); void sync_integrator(); void sync_camera(BL::RenderSettings& b_render, BL::Object& b_override, @@ -93,13 +96,15 @@ public: Camera *cam, int width, int height); + static PassType get_pass_type(BL::RenderPass& b_pass); + static int get_denoising_pass(BL::RenderPass& b_pass); + private: /* sync */ void sync_lamps(bool update_all); void sync_materials(bool update_all); - void sync_objects(BL::SpaceView3D& b_v3d, float motion_time = 0.0f); + void sync_objects(float motion_time = 0.0f); void sync_motion(BL::RenderSettings& b_render, - BL::SpaceView3D& b_v3d, BL::Object& b_override, int width, int height, void **python_thread_state); diff --git a/intern/cycles/blender/blender_util.h b/intern/cycles/blender/blender_util.h index 
abdbb6be0fd..363e19f7a20 100644 --- a/intern/cycles/blender/blender_util.h +++ b/intern/cycles/blender/blender_util.h @@ -51,8 +51,8 @@ static inline BL::Mesh object_to_mesh(BL::BlendData& data, bool calc_undeformed, Mesh::SubdivisionType subdivision_type) { - bool subsurf_mod_show_render; - bool subsurf_mod_show_viewport; + bool subsurf_mod_show_render = false; + bool subsurf_mod_show_viewport = false; if(subdivision_type != Mesh::SUBDIVISION_NONE) { BL::Modifier subsurf_mod = object.modifiers[object.modifiers.length()-1]; @@ -299,7 +299,7 @@ static inline uint get_layer(const BL::Array<int, 20>& array) for(uint i = 0; i < 20; i++) if(array[i]) layer |= (1 << i); - + return layer; } @@ -434,7 +434,7 @@ static inline string get_string(PointerRNA& ptr, const char *name) string str(cstr); if(cstr != cstrbuf) MEM_freeN(cstr); - + return str; } @@ -451,7 +451,7 @@ static inline string blender_absolute_path(BL::BlendData& b_data, { if(path.size() >= 2 && path[0] == '/' && path[1] == '/') { string dirname; - + if(b_id.library()) { BL::ID b_library_id(b_id.library()); dirname = blender_absolute_path(b_data, @@ -544,7 +544,7 @@ static inline BL::SmokeDomainSettings object_smoke_domain_find(BL::Object& b_ob) return b_smd.domain_settings(); } } - + return BL::SmokeDomainSettings(PointerRNA_NULL); } @@ -816,4 +816,3 @@ protected: CCL_NAMESPACE_END #endif /* __BLENDER_UTIL_H__ */ - diff --git a/intern/cycles/bvh/CMakeLists.txt b/intern/cycles/bvh/CMakeLists.txt index 4701d75350a..6078db5a8ca 100644 --- a/intern/cycles/bvh/CMakeLists.txt +++ b/intern/cycles/bvh/CMakeLists.txt @@ -8,6 +8,8 @@ set(INC_SYS set(SRC bvh.cpp + bvh2.cpp + bvh4.cpp bvh_binning.cpp bvh_build.cpp bvh_node.cpp @@ -18,6 +20,8 @@ set(SRC set(SRC_HEADERS bvh.h + bvh2.h + bvh4.h bvh_binning.h bvh_build.h bvh_node.h diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp index 58348d16746..33143e2d8aa 100644 --- a/intern/cycles/bvh/bvh.cpp +++ b/intern/cycles/bvh/bvh.cpp @@ -15,45 +15,32 @@ * 
limitations under the License. */ +#include "bvh/bvh.h" + #include "render/mesh.h" #include "render/object.h" -#include "render/scene.h" -#include "render/curves.h" -#include "bvh/bvh.h" +#include "bvh/bvh2.h" +#include "bvh/bvh4.h" #include "bvh/bvh_build.h" #include "bvh/bvh_node.h" -#include "bvh/bvh_params.h" -#include "bvh/bvh_unaligned.h" -#include "util/util_debug.h" #include "util/util_foreach.h" -#include "util/util_logging.h" -#include "util/util_map.h" #include "util/util_progress.h" -#include "util/util_system.h" -#include "util/util_types.h" -#include "util/util_math.h" CCL_NAMESPACE_BEGIN /* Pack Utility */ -struct BVHStackEntry +BVHStackEntry::BVHStackEntry(const BVHNode *n, int i) + : node(n), idx(i) { - const BVHNode *node; - int idx; - - BVHStackEntry(const BVHNode* n = 0, int i = 0) - : node(n), idx(i) - { - } +} - int encodeIdx() const - { - return (node->is_leaf())? ~idx: idx; - } -}; +int BVHStackEntry::encodeIdx() const +{ + return (node->is_leaf())? ~idx: idx; +} /* BVH */ @@ -65,9 +52,9 @@ BVH::BVH(const BVHParams& params_, const vector<Object*>& objects_) BVH *BVH::create(const BVHParams& params, const vector<Object*>& objects) { if(params.use_qbvh) - return new QBVH(params, objects); + return new BVH4(params, objects); else - return new BinaryBVH(params, objects); + return new BVH2(params, objects); } /* Building */ @@ -418,832 +405,4 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) } } -/* Regular BVH */ - -static bool node_bvh_is_unaligned(const BVHNode *node) -{ - const BVHNode *node0 = node->get_child(0), - *node1 = node->get_child(1); - return node0->is_unaligned || node1->is_unaligned; -} - -BinaryBVH::BinaryBVH(const BVHParams& params_, const vector<Object*>& objects_) -: BVH(params_, objects_) -{ -} - -void BinaryBVH::pack_leaf(const BVHStackEntry& e, - const LeafNode *leaf) -{ - assert(e.idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size()); - float4 data[BVH_NODE_LEAF_SIZE]; - memset(data, 0, sizeof(data)); - 
if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) { - /* object */ - data[0].x = __int_as_float(~(leaf->lo)); - data[0].y = __int_as_float(0); - } - else { - /* triangle */ - data[0].x = __int_as_float(leaf->lo); - data[0].y = __int_as_float(leaf->hi); - } - data[0].z = __uint_as_float(leaf->visibility); - if(leaf->num_triangles() != 0) { - data[0].w = __uint_as_float(pack.prim_type[leaf->lo]); - } - - memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_NODE_LEAF_SIZE); -} - -void BinaryBVH::pack_inner(const BVHStackEntry& e, - const BVHStackEntry& e0, - const BVHStackEntry& e1) -{ - if(e0.node->is_unaligned || e1.node->is_unaligned) { - pack_unaligned_inner(e, e0, e1); - } else { - pack_aligned_inner(e, e0, e1); - } -} - -void BinaryBVH::pack_aligned_inner(const BVHStackEntry& e, - const BVHStackEntry& e0, - const BVHStackEntry& e1) -{ - pack_aligned_node(e.idx, - e0.node->bounds, e1.node->bounds, - e0.encodeIdx(), e1.encodeIdx(), - e0.node->visibility, e1.node->visibility); -} - -void BinaryBVH::pack_aligned_node(int idx, - const BoundBox& b0, - const BoundBox& b1, - int c0, int c1, - uint visibility0, uint visibility1) -{ - assert(idx + BVH_NODE_SIZE <= pack.nodes.size()); - assert(c0 < 0 || c0 < pack.nodes.size()); - assert(c1 < 0 || c1 < pack.nodes.size()); - - int4 data[BVH_NODE_SIZE] = { - make_int4(visibility0 & ~PATH_RAY_NODE_UNALIGNED, - visibility1 & ~PATH_RAY_NODE_UNALIGNED, - c0, c1), - make_int4(__float_as_int(b0.min.x), - __float_as_int(b1.min.x), - __float_as_int(b0.max.x), - __float_as_int(b1.max.x)), - make_int4(__float_as_int(b0.min.y), - __float_as_int(b1.min.y), - __float_as_int(b0.max.y), - __float_as_int(b1.max.y)), - make_int4(__float_as_int(b0.min.z), - __float_as_int(b1.min.z), - __float_as_int(b0.max.z), - __float_as_int(b1.max.z)), - }; - - memcpy(&pack.nodes[idx], data, sizeof(int4)*BVH_NODE_SIZE); -} - -void BinaryBVH::pack_unaligned_inner(const BVHStackEntry& e, - const BVHStackEntry& e0, - const BVHStackEntry& 
e1) -{ - pack_unaligned_node(e.idx, - e0.node->get_aligned_space(), - e1.node->get_aligned_space(), - e0.node->bounds, - e1.node->bounds, - e0.encodeIdx(), e1.encodeIdx(), - e0.node->visibility, e1.node->visibility); -} - -void BinaryBVH::pack_unaligned_node(int idx, - const Transform& aligned_space0, - const Transform& aligned_space1, - const BoundBox& bounds0, - const BoundBox& bounds1, - int c0, int c1, - uint visibility0, uint visibility1) -{ - assert(idx + BVH_UNALIGNED_NODE_SIZE <= pack.nodes.size()); - assert(c0 < 0 || c0 < pack.nodes.size()); - assert(c1 < 0 || c1 < pack.nodes.size()); - - float4 data[BVH_UNALIGNED_NODE_SIZE]; - Transform space0 = BVHUnaligned::compute_node_transform(bounds0, - aligned_space0); - Transform space1 = BVHUnaligned::compute_node_transform(bounds1, - aligned_space1); - data[0] = make_float4(__int_as_float(visibility0 | PATH_RAY_NODE_UNALIGNED), - __int_as_float(visibility1 | PATH_RAY_NODE_UNALIGNED), - __int_as_float(c0), - __int_as_float(c1)); - - data[1] = space0.x; - data[2] = space0.y; - data[3] = space0.z; - data[4] = space1.x; - data[5] = space1.y; - data[6] = space1.z; - - memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_NODE_SIZE); -} - -void BinaryBVH::pack_nodes(const BVHNode *root) -{ - const size_t num_nodes = root->getSubtreeSize(BVH_STAT_NODE_COUNT); - const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT); - assert(num_leaf_nodes <= num_nodes); - const size_t num_inner_nodes = num_nodes - num_leaf_nodes; - size_t node_size; - if(params.use_unaligned_nodes) { - const size_t num_unaligned_nodes = - root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_COUNT); - node_size = (num_unaligned_nodes * BVH_UNALIGNED_NODE_SIZE) + - (num_inner_nodes - num_unaligned_nodes) * BVH_NODE_SIZE; - } - else { - node_size = num_inner_nodes * BVH_NODE_SIZE; - } - /* Resize arrays */ - pack.nodes.clear(); - pack.leaf_nodes.clear(); - /* For top level BVH, first merge existing BVH's so we know the offsets. 
*/ - if(params.top_level) { - pack_instances(node_size, num_leaf_nodes*BVH_NODE_LEAF_SIZE); - } - else { - pack.nodes.resize(node_size); - pack.leaf_nodes.resize(num_leaf_nodes*BVH_NODE_LEAF_SIZE); - } - - int nextNodeIdx = 0, nextLeafNodeIdx = 0; - - vector<BVHStackEntry> stack; - stack.reserve(BVHParams::MAX_DEPTH*2); - if(root->is_leaf()) { - stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++)); - } - else { - stack.push_back(BVHStackEntry(root, nextNodeIdx)); - nextNodeIdx += node_bvh_is_unaligned(root) - ? BVH_UNALIGNED_NODE_SIZE - : BVH_NODE_SIZE; - } - - while(stack.size()) { - BVHStackEntry e = stack.back(); - stack.pop_back(); - - if(e.node->is_leaf()) { - /* leaf node */ - const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node); - pack_leaf(e, leaf); - } - else { - /* innner node */ - int idx[2]; - for(int i = 0; i < 2; ++i) { - if(e.node->get_child(i)->is_leaf()) { - idx[i] = nextLeafNodeIdx++; - } - else { - idx[i] = nextNodeIdx; - nextNodeIdx += node_bvh_is_unaligned(e.node->get_child(i)) - ? BVH_UNALIGNED_NODE_SIZE - : BVH_NODE_SIZE; - } - } - - stack.push_back(BVHStackEntry(e.node->get_child(0), idx[0])); - stack.push_back(BVHStackEntry(e.node->get_child(1), idx[1])); - - pack_inner(e, stack[stack.size()-2], stack[stack.size()-1]); - } - } - assert(node_size == nextNodeIdx); - /* root index to start traversal at, to handle case of single leaf node */ - pack.root_index = (root->is_leaf())? -1: 0; -} - -void BinaryBVH::refit_nodes() -{ - assert(!params.top_level); - - BoundBox bbox = BoundBox::empty; - uint visibility = 0; - refit_node(0, (pack.root_index == -1)? 
true: false, bbox, visibility); -} - -void BinaryBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) -{ - if(leaf) { - assert(idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size()); - const int4 *data = &pack.leaf_nodes[idx]; - const int c0 = data[0].x; - const int c1 = data[0].y; - /* refit leaf node */ - for(int prim = c0; prim < c1; prim++) { - int pidx = pack.prim_index[prim]; - int tob = pack.prim_object[prim]; - Object *ob = objects[tob]; - - if(pidx == -1) { - /* object instance */ - bbox.grow(ob->bounds); - } - else { - /* primitives */ - const Mesh *mesh = ob->mesh; - - if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) { - /* curves */ - int str_offset = (params.top_level)? mesh->curve_offset: 0; - Mesh::Curve curve = mesh->get_curve(pidx - str_offset); - int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]); - - curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox); - - visibility |= PATH_RAY_CURVE; - - /* motion curves */ - if(mesh->use_motion_blur) { - Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - - if(attr) { - size_t mesh_size = mesh->curve_keys.size(); - size_t steps = mesh->motion_steps - 1; - float3 *key_steps = attr->data_float3(); - - for(size_t i = 0; i < steps; i++) - curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bbox); - } - } - } - else { - /* triangles */ - int tri_offset = (params.top_level)? 
mesh->tri_offset: 0; - Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset); - const float3 *vpos = &mesh->verts[0]; - - triangle.bounds_grow(vpos, bbox); - - /* motion triangles */ - if(mesh->use_motion_blur) { - Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - - if(attr) { - size_t mesh_size = mesh->verts.size(); - size_t steps = mesh->motion_steps - 1; - float3 *vert_steps = attr->data_float3(); - - for(size_t i = 0; i < steps; i++) - triangle.bounds_grow(vert_steps + i*mesh_size, bbox); - } - } - } - } - - visibility |= ob->visibility; - } - - /* TODO(sergey): De-duplicate with pack_leaf(). */ - float4 leaf_data[BVH_NODE_LEAF_SIZE]; - leaf_data[0].x = __int_as_float(c0); - leaf_data[0].y = __int_as_float(c1); - leaf_data[0].z = __uint_as_float(visibility); - leaf_data[0].w = __uint_as_float(data[0].w); - memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_NODE_LEAF_SIZE); - } - else { - assert(idx + BVH_NODE_SIZE <= pack.nodes.size()); - - const int4 *data = &pack.nodes[idx]; - const bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0; - const int c0 = data[0].z; - const int c1 = data[0].w; - /* refit inner node, set bbox from children */ - BoundBox bbox0 = BoundBox::empty, bbox1 = BoundBox::empty; - uint visibility0 = 0, visibility1 = 0; - - refit_node((c0 < 0)? -c0-1: c0, (c0 < 0), bbox0, visibility0); - refit_node((c1 < 0)? -c1-1: c1, (c1 < 0), bbox1, visibility1); - - if(is_unaligned) { - Transform aligned_space = transform_identity(); - pack_unaligned_node(idx, - aligned_space, aligned_space, - bbox0, bbox1, - c0, c1, - visibility0, - visibility1); - } - else { - pack_aligned_node(idx, - bbox0, bbox1, - c0, c1, - visibility0, - visibility1); - } - - bbox.grow(bbox0); - bbox.grow(bbox1); - visibility = visibility0|visibility1; - } -} - -/* QBVH */ - -/* Can we avoid this somehow or make more generic? - * - * Perhaps we can merge nodes in actual tree and make our - * life easier all over the place. 
- */ -static bool node_qbvh_is_unaligned(const BVHNode *node) -{ - const BVHNode *node0 = node->get_child(0), - *node1 = node->get_child(1); - bool has_unaligned = false; - if(node0->is_leaf()) { - has_unaligned |= node0->is_unaligned; - } - else { - has_unaligned |= node0->get_child(0)->is_unaligned; - has_unaligned |= node0->get_child(1)->is_unaligned; - } - if(node1->is_leaf()) { - has_unaligned |= node1->is_unaligned; - } - else { - has_unaligned |= node1->get_child(0)->is_unaligned; - has_unaligned |= node1->get_child(1)->is_unaligned; - } - return has_unaligned; -} - -QBVH::QBVH(const BVHParams& params_, const vector<Object*>& objects_) -: BVH(params_, objects_) -{ - params.use_qbvh = true; -} - -void QBVH::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf) -{ - float4 data[BVH_QNODE_LEAF_SIZE]; - memset(data, 0, sizeof(data)); - if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) { - /* object */ - data[0].x = __int_as_float(~(leaf->lo)); - data[0].y = __int_as_float(0); - } - else { - /* triangle */ - data[0].x = __int_as_float(leaf->lo); - data[0].y = __int_as_float(leaf->hi); - } - data[0].z = __uint_as_float(leaf->visibility); - if(leaf->num_triangles() != 0) { - data[0].w = __uint_as_float(pack.prim_type[leaf->lo]); - } - - memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_QNODE_LEAF_SIZE); -} - -void QBVH::pack_inner(const BVHStackEntry& e, - const BVHStackEntry *en, - int num) -{ - bool has_unaligned = false; - /* Check whether we have to create unaligned node or all nodes are aligned - * and we can cut some corner here. - */ - if(params.use_unaligned_nodes) { - for(int i = 0; i < num; i++) { - if(en[i].node->is_unaligned) { - has_unaligned = true; - break; - } - } - } - if(has_unaligned) { - /* There's no unaligned children, pack into AABB node. */ - pack_unaligned_inner(e, en, num); - } - else { - /* Create unaligned node with orientation transform for each of the - * children. 
- */ - pack_aligned_inner(e, en, num); - } -} - -void QBVH::pack_aligned_inner(const BVHStackEntry& e, - const BVHStackEntry *en, - int num) -{ - BoundBox bounds[4]; - int child[4]; - for(int i = 0; i < num; ++i) { - bounds[i] = en[i].node->bounds; - child[i] = en[i].encodeIdx(); - } - pack_aligned_node(e.idx, - bounds, - child, - e.node->visibility, - e.node->time_from, - e.node->time_to, - num); -} - -void QBVH::pack_aligned_node(int idx, - const BoundBox *bounds, - const int *child, - const uint visibility, - const float time_from, - const float time_to, - const int num) -{ - float4 data[BVH_QNODE_SIZE]; - memset(data, 0, sizeof(data)); - - data[0].x = __uint_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED); - data[0].y = time_from; - data[0].z = time_to; - - for(int i = 0; i < num; i++) { - float3 bb_min = bounds[i].min; - float3 bb_max = bounds[i].max; - - data[1][i] = bb_min.x; - data[2][i] = bb_max.x; - data[3][i] = bb_min.y; - data[4][i] = bb_max.y; - data[5][i] = bb_min.z; - data[6][i] = bb_max.z; - - data[7][i] = __int_as_float(child[i]); - } - - for(int i = num; i < 4; i++) { - /* We store BB which would never be recorded as intersection - * so kernel might safely assume there are always 4 child nodes. 
- */ - data[1][i] = FLT_MAX; - data[2][i] = -FLT_MAX; - - data[3][i] = FLT_MAX; - data[4][i] = -FLT_MAX; - - data[5][i] = FLT_MAX; - data[6][i] = -FLT_MAX; - - data[7][i] = __int_as_float(0); - } - - memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_QNODE_SIZE); -} - -void QBVH::pack_unaligned_inner(const BVHStackEntry& e, - const BVHStackEntry *en, - int num) -{ - Transform aligned_space[4]; - BoundBox bounds[4]; - int child[4]; - for(int i = 0; i < num; ++i) { - aligned_space[i] = en[i].node->get_aligned_space(); - bounds[i] = en[i].node->bounds; - child[i] = en[i].encodeIdx(); - } - pack_unaligned_node(e.idx, - aligned_space, - bounds, - child, - e.node->visibility, - e.node->time_from, - e.node->time_to, - num); -} - -void QBVH::pack_unaligned_node(int idx, - const Transform *aligned_space, - const BoundBox *bounds, - const int *child, - const uint visibility, - const float time_from, - const float time_to, - const int num) -{ - float4 data[BVH_UNALIGNED_QNODE_SIZE]; - memset(data, 0, sizeof(data)); - - data[0].x = __uint_as_float(visibility | PATH_RAY_NODE_UNALIGNED); - data[0].y = time_from; - data[0].z = time_to; - - for(int i = 0; i < num; i++) { - Transform space = BVHUnaligned::compute_node_transform( - bounds[i], - aligned_space[i]); - - data[1][i] = space.x.x; - data[2][i] = space.x.y; - data[3][i] = space.x.z; - - data[4][i] = space.y.x; - data[5][i] = space.y.y; - data[6][i] = space.y.z; - - data[7][i] = space.z.x; - data[8][i] = space.z.y; - data[9][i] = space.z.z; - - data[10][i] = space.x.w; - data[11][i] = space.y.w; - data[12][i] = space.z.w; - - data[13][i] = __int_as_float(child[i]); - } - - for(int i = num; i < 4; i++) { - /* We store BB which would never be recorded as intersection - * so kernel might safely assume there are always 4 child nodes. 
- */ - - data[1][i] = 1.0f; - data[2][i] = 0.0f; - data[3][i] = 0.0f; - - data[4][i] = 0.0f; - data[5][i] = 0.0f; - data[6][i] = 0.0f; - - data[7][i] = 0.0f; - data[8][i] = 0.0f; - data[9][i] = 0.0f; - - data[10][i] = -FLT_MAX; - data[11][i] = -FLT_MAX; - data[12][i] = -FLT_MAX; - - data[13][i] = __int_as_float(0); - } - - memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_QNODE_SIZE); -} - -/* Quad SIMD Nodes */ - -void QBVH::pack_nodes(const BVHNode *root) -{ - /* Calculate size of the arrays required. */ - const size_t num_nodes = root->getSubtreeSize(BVH_STAT_QNODE_COUNT); - const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT); - assert(num_leaf_nodes <= num_nodes); - const size_t num_inner_nodes = num_nodes - num_leaf_nodes; - size_t node_size; - if(params.use_unaligned_nodes) { - const size_t num_unaligned_nodes = - root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_QNODE_COUNT); - node_size = (num_unaligned_nodes * BVH_UNALIGNED_QNODE_SIZE) + - (num_inner_nodes - num_unaligned_nodes) * BVH_QNODE_SIZE; - } - else { - node_size = num_inner_nodes * BVH_QNODE_SIZE; - } - /* Resize arrays. */ - pack.nodes.clear(); - pack.leaf_nodes.clear(); - /* For top level BVH, first merge existing BVH's so we know the offsets. */ - if(params.top_level) { - pack_instances(node_size, num_leaf_nodes*BVH_QNODE_LEAF_SIZE); - } - else { - pack.nodes.resize(node_size); - pack.leaf_nodes.resize(num_leaf_nodes*BVH_QNODE_LEAF_SIZE); - } - - int nextNodeIdx = 0, nextLeafNodeIdx = 0; - - vector<BVHStackEntry> stack; - stack.reserve(BVHParams::MAX_DEPTH*2); - if(root->is_leaf()) { - stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++)); - } - else { - stack.push_back(BVHStackEntry(root, nextNodeIdx)); - nextNodeIdx += node_qbvh_is_unaligned(root) - ? 
BVH_UNALIGNED_QNODE_SIZE - : BVH_QNODE_SIZE; - } - - while(stack.size()) { - BVHStackEntry e = stack.back(); - stack.pop_back(); - - if(e.node->is_leaf()) { - /* leaf node */ - const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node); - pack_leaf(e, leaf); - } - else { - /* Inner node. */ - const BVHNode *node = e.node; - const BVHNode *node0 = node->get_child(0); - const BVHNode *node1 = node->get_child(1); - /* Collect nodes. */ - const BVHNode *nodes[4]; - int numnodes = 0; - if(node0->is_leaf()) { - nodes[numnodes++] = node0; - } - else { - nodes[numnodes++] = node0->get_child(0); - nodes[numnodes++] = node0->get_child(1); - } - if(node1->is_leaf()) { - nodes[numnodes++] = node1; - } - else { - nodes[numnodes++] = node1->get_child(0); - nodes[numnodes++] = node1->get_child(1); - } - /* Push entries on the stack. */ - for(int i = 0; i < numnodes; ++i) { - int idx; - if(nodes[i]->is_leaf()) { - idx = nextLeafNodeIdx++; - } - else { - idx = nextNodeIdx; - nextNodeIdx += node_qbvh_is_unaligned(nodes[i]) - ? BVH_UNALIGNED_QNODE_SIZE - : BVH_QNODE_SIZE; - } - stack.push_back(BVHStackEntry(nodes[i], idx)); - } - /* Set node. */ - pack_inner(e, &stack[stack.size()-numnodes], numnodes); - } - } - assert(node_size == nextNodeIdx); - /* Root index to start traversal at, to handle case of single leaf node. */ - pack.root_index = (root->is_leaf())? -1: 0; -} - -void QBVH::refit_nodes() -{ - assert(!params.top_level); - - BoundBox bbox = BoundBox::empty; - uint visibility = 0; - refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility); -} - -void QBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) -{ - if(leaf) { - int4 *data = &pack.leaf_nodes[idx]; - int4 c = data[0]; - /* Refit leaf node. */ - for(int prim = c.x; prim < c.y; prim++) { - int pidx = pack.prim_index[prim]; - int tob = pack.prim_object[prim]; - Object *ob = objects[tob]; - - if(pidx == -1) { - /* Object instance. */ - bbox.grow(ob->bounds); - } - else { - /* Primitives. 
*/ - const Mesh *mesh = ob->mesh; - - if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) { - /* Curves. */ - int str_offset = (params.top_level)? mesh->curve_offset: 0; - Mesh::Curve curve = mesh->get_curve(pidx - str_offset); - int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]); - - curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox); - - visibility |= PATH_RAY_CURVE; - - /* Motion curves. */ - if(mesh->use_motion_blur) { - Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - - if(attr) { - size_t mesh_size = mesh->curve_keys.size(); - size_t steps = mesh->motion_steps - 1; - float3 *key_steps = attr->data_float3(); - - for(size_t i = 0; i < steps; i++) - curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bbox); - } - } - } - else { - /* Triangles. */ - int tri_offset = (params.top_level)? mesh->tri_offset: 0; - Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset); - const float3 *vpos = &mesh->verts[0]; - - triangle.bounds_grow(vpos, bbox); - - /* Motion triangles. */ - if(mesh->use_motion_blur) { - Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - - if(attr) { - size_t mesh_size = mesh->verts.size(); - size_t steps = mesh->motion_steps - 1; - float3 *vert_steps = attr->data_float3(); - - for(size_t i = 0; i < steps; i++) - triangle.bounds_grow(vert_steps + i*mesh_size, bbox); - } - } - } - } - - visibility |= ob->visibility; - } - - /* TODO(sergey): This is actually a copy of pack_leaf(), - * but this chunk of code only knows actual data and has - * no idea about BVHNode. - * - * Would be nice to de-duplicate code, but trying to make - * making code more general ends up in much nastier code - * in my opinion so far. - * - * Same applies to the inner nodes case below. 
- */ - float4 leaf_data[BVH_QNODE_LEAF_SIZE]; - leaf_data[0].x = __int_as_float(c.x); - leaf_data[0].y = __int_as_float(c.y); - leaf_data[0].z = __uint_as_float(visibility); - leaf_data[0].w = __uint_as_float(c.w); - memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_QNODE_LEAF_SIZE); - } - else { - int4 *data = &pack.nodes[idx]; - bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0; - int4 c; - if(is_unaligned) { - c = data[13]; - } - else { - c = data[7]; - } - /* Refit inner node, set bbox from children. */ - BoundBox child_bbox[4] = {BoundBox::empty, - BoundBox::empty, - BoundBox::empty, - BoundBox::empty}; - uint child_visibility[4] = {0}; - int num_nodes = 0; - - for(int i = 0; i < 4; ++i) { - if(c[i] != 0) { - refit_node((c[i] < 0)? -c[i]-1: c[i], (c[i] < 0), - child_bbox[i], child_visibility[i]); - ++num_nodes; - bbox.grow(child_bbox[i]); - visibility |= child_visibility[i]; - } - } - - if(is_unaligned) { - Transform aligned_space[4] = {transform_identity(), - transform_identity(), - transform_identity(), - transform_identity()}; - pack_unaligned_node(idx, - aligned_space, - child_bbox, - &c[0], - visibility, - 0.0f, - 1.0f, - 4); - } - else { - pack_aligned_node(idx, - child_bbox, - &c[0], - visibility, - 0.0f, - 1.0f, - 4); - } - } -} - CCL_NAMESPACE_END diff --git a/intern/cycles/bvh/bvh.h b/intern/cycles/bvh/bvh.h index 60bc62ee6e4..7bac6112fd9 100644 --- a/intern/cycles/bvh/bvh.h +++ b/intern/cycles/bvh/bvh.h @@ -33,15 +33,8 @@ class LeafNode; class Object; class Progress; -#define BVH_NODE_SIZE 4 -#define BVH_NODE_LEAF_SIZE 1 -#define BVH_QNODE_SIZE 8 -#define BVH_QNODE_LEAF_SIZE 1 -#define BVH_ALIGN 4096 -#define TRI_NODE_SIZE 3 - -#define BVH_UNALIGNED_NODE_SIZE 7 -#define BVH_UNALIGNED_QNODE_SIZE 14 +#define BVH_ALIGN 4096 +#define TRI_NODE_SIZE 3 /* Packed BVH * @@ -54,7 +47,7 @@ struct PackedBVH { /* BVH leaf nodes storage. 
*/ array<int4> leaf_nodes; /* object index to BVH node index mapping for instances */ - array<int> object_node; + array<int> object_node; /* Mapping from primitive index to index in triangle array. */ array<uint> prim_tri_index; /* Continuous storage of triangle vertices. */ @@ -110,95 +103,16 @@ protected: virtual void refit_nodes() = 0; }; -/* Binary BVH - * - * Typical BVH with each node having two children. */ - -class BinaryBVH : public BVH { -protected: - /* constructor */ - friend class BVH; - BinaryBVH(const BVHParams& params, const vector<Object*>& objects); - - /* pack */ - void pack_nodes(const BVHNode *root); - - void pack_leaf(const BVHStackEntry& e, - const LeafNode *leaf); - void pack_inner(const BVHStackEntry& e, - const BVHStackEntry& e0, - const BVHStackEntry& e1); - - void pack_aligned_inner(const BVHStackEntry& e, - const BVHStackEntry& e0, - const BVHStackEntry& e1); - void pack_aligned_node(int idx, - const BoundBox& b0, - const BoundBox& b1, - int c0, int c1, - uint visibility0, uint visibility1); - - void pack_unaligned_inner(const BVHStackEntry& e, - const BVHStackEntry& e0, - const BVHStackEntry& e1); - void pack_unaligned_node(int idx, - const Transform& aligned_space0, - const Transform& aligned_space1, - const BoundBox& b0, - const BoundBox& b1, - int c0, int c1, - uint visibility0, uint visibility1); - - /* refit */ - void refit_nodes(); - void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility); -}; - -/* QBVH - * - * Quad BVH, with each node having four children, to use with SIMD instructions. 
*/ +/* Pack Utility */ +struct BVHStackEntry +{ + const BVHNode *node; + int idx; -class QBVH : public BVH { -protected: - /* constructor */ - friend class BVH; - QBVH(const BVHParams& params, const vector<Object*>& objects); - - /* pack */ - void pack_nodes(const BVHNode *root); - - void pack_leaf(const BVHStackEntry& e, const LeafNode *leaf); - void pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num); - - void pack_aligned_inner(const BVHStackEntry& e, - const BVHStackEntry *en, - int num); - void pack_aligned_node(int idx, - const BoundBox *bounds, - const int *child, - const uint visibility, - const float time_from, - const float time_to, - const int num); - - void pack_unaligned_inner(const BVHStackEntry& e, - const BVHStackEntry *en, - int num); - void pack_unaligned_node(int idx, - const Transform *aligned_space, - const BoundBox *bounds, - const int *child, - const uint visibility, - const float time_from, - const float time_to, - const int num); - - /* refit */ - void refit_nodes(); - void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility); + BVHStackEntry(const BVHNode *n = 0, int i = 0); + int encodeIdx() const; }; CCL_NAMESPACE_END #endif /* __BVH_H__ */ - diff --git a/intern/cycles/bvh/bvh2.cpp b/intern/cycles/bvh/bvh2.cpp new file mode 100644 index 00000000000..340ba7dcf53 --- /dev/null +++ b/intern/cycles/bvh/bvh2.cpp @@ -0,0 +1,364 @@ +/* + * Adapted from code copyright 2009-2010 NVIDIA Corporation + * Modifications Copyright 2011, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "bvh/bvh2.h" + +#include "render/mesh.h" +#include "render/object.h" + +#include "bvh/bvh_node.h" +#include "bvh/bvh_unaligned.h" + +CCL_NAMESPACE_BEGIN + +static bool node_bvh_is_unaligned(const BVHNode *node) +{ + const BVHNode *node0 = node->get_child(0), + *node1 = node->get_child(1); + return node0->is_unaligned || node1->is_unaligned; +} + +BVH2::BVH2(const BVHParams& params_, const vector<Object*>& objects_) +: BVH(params_, objects_) +{ +} + +void BVH2::pack_leaf(const BVHStackEntry& e, + const LeafNode *leaf) +{ + assert(e.idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size()); + float4 data[BVH_NODE_LEAF_SIZE]; + memset(data, 0, sizeof(data)); + if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) { + /* object */ + data[0].x = __int_as_float(~(leaf->lo)); + data[0].y = __int_as_float(0); + } + else { + /* triangle */ + data[0].x = __int_as_float(leaf->lo); + data[0].y = __int_as_float(leaf->hi); + } + data[0].z = __uint_as_float(leaf->visibility); + if(leaf->num_triangles() != 0) { + data[0].w = __uint_as_float(pack.prim_type[leaf->lo]); + } + + memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_NODE_LEAF_SIZE); +} + +void BVH2::pack_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1) +{ + if(e0.node->is_unaligned || e1.node->is_unaligned) { + pack_unaligned_inner(e, e0, e1); + } else { + pack_aligned_inner(e, e0, e1); + } +} + +void BVH2::pack_aligned_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1) +{ + pack_aligned_node(e.idx, + e0.node->bounds, e1.node->bounds, + e0.encodeIdx(), e1.encodeIdx(), + e0.node->visibility, e1.node->visibility); +} + +void BVH2::pack_aligned_node(int idx, + const BoundBox& b0, + const BoundBox& b1, + int c0, int c1, + uint visibility0, uint visibility1) +{ + assert(idx + BVH_NODE_SIZE <= pack.nodes.size()); + assert(c0 < 
0 || c0 < pack.nodes.size()); + assert(c1 < 0 || c1 < pack.nodes.size()); + + int4 data[BVH_NODE_SIZE] = { + make_int4(visibility0 & ~PATH_RAY_NODE_UNALIGNED, + visibility1 & ~PATH_RAY_NODE_UNALIGNED, + c0, c1), + make_int4(__float_as_int(b0.min.x), + __float_as_int(b1.min.x), + __float_as_int(b0.max.x), + __float_as_int(b1.max.x)), + make_int4(__float_as_int(b0.min.y), + __float_as_int(b1.min.y), + __float_as_int(b0.max.y), + __float_as_int(b1.max.y)), + make_int4(__float_as_int(b0.min.z), + __float_as_int(b1.min.z), + __float_as_int(b0.max.z), + __float_as_int(b1.max.z)), + }; + + memcpy(&pack.nodes[idx], data, sizeof(int4)*BVH_NODE_SIZE); +} + +void BVH2::pack_unaligned_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1) +{ + pack_unaligned_node(e.idx, + e0.node->get_aligned_space(), + e1.node->get_aligned_space(), + e0.node->bounds, + e1.node->bounds, + e0.encodeIdx(), e1.encodeIdx(), + e0.node->visibility, e1.node->visibility); +} + +void BVH2::pack_unaligned_node(int idx, + const Transform& aligned_space0, + const Transform& aligned_space1, + const BoundBox& bounds0, + const BoundBox& bounds1, + int c0, int c1, + uint visibility0, uint visibility1) +{ + assert(idx + BVH_UNALIGNED_NODE_SIZE <= pack.nodes.size()); + assert(c0 < 0 || c0 < pack.nodes.size()); + assert(c1 < 0 || c1 < pack.nodes.size()); + + float4 data[BVH_UNALIGNED_NODE_SIZE]; + Transform space0 = BVHUnaligned::compute_node_transform(bounds0, + aligned_space0); + Transform space1 = BVHUnaligned::compute_node_transform(bounds1, + aligned_space1); + data[0] = make_float4(__int_as_float(visibility0 | PATH_RAY_NODE_UNALIGNED), + __int_as_float(visibility1 | PATH_RAY_NODE_UNALIGNED), + __int_as_float(c0), + __int_as_float(c1)); + + data[1] = space0.x; + data[2] = space0.y; + data[3] = space0.z; + data[4] = space1.x; + data[5] = space1.y; + data[6] = space1.z; + + memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_NODE_SIZE); +} + +void BVH2::pack_nodes(const 
BVHNode *root) +{ + const size_t num_nodes = root->getSubtreeSize(BVH_STAT_NODE_COUNT); + const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT); + assert(num_leaf_nodes <= num_nodes); + const size_t num_inner_nodes = num_nodes - num_leaf_nodes; + size_t node_size; + if(params.use_unaligned_nodes) { + const size_t num_unaligned_nodes = + root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_COUNT); + node_size = (num_unaligned_nodes * BVH_UNALIGNED_NODE_SIZE) + + (num_inner_nodes - num_unaligned_nodes) * BVH_NODE_SIZE; + } + else { + node_size = num_inner_nodes * BVH_NODE_SIZE; + } + /* Resize arrays */ + pack.nodes.clear(); + pack.leaf_nodes.clear(); + /* For top level BVH, first merge existing BVH's so we know the offsets. */ + if(params.top_level) { + pack_instances(node_size, num_leaf_nodes*BVH_NODE_LEAF_SIZE); + } + else { + pack.nodes.resize(node_size); + pack.leaf_nodes.resize(num_leaf_nodes*BVH_NODE_LEAF_SIZE); + } + + int nextNodeIdx = 0, nextLeafNodeIdx = 0; + + vector<BVHStackEntry> stack; + stack.reserve(BVHParams::MAX_DEPTH*2); + if(root->is_leaf()) { + stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++)); + } + else { + stack.push_back(BVHStackEntry(root, nextNodeIdx)); + nextNodeIdx += node_bvh_is_unaligned(root) + ? BVH_UNALIGNED_NODE_SIZE + : BVH_NODE_SIZE; + } + + while(stack.size()) { + BVHStackEntry e = stack.back(); + stack.pop_back(); + + if(e.node->is_leaf()) { + /* leaf node */ + const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node); + pack_leaf(e, leaf); + } + else { + /* innner node */ + int idx[2]; + for(int i = 0; i < 2; ++i) { + if(e.node->get_child(i)->is_leaf()) { + idx[i] = nextLeafNodeIdx++; + } + else { + idx[i] = nextNodeIdx; + nextNodeIdx += node_bvh_is_unaligned(e.node->get_child(i)) + ? 
BVH_UNALIGNED_NODE_SIZE + : BVH_NODE_SIZE; + } + } + + stack.push_back(BVHStackEntry(e.node->get_child(0), idx[0])); + stack.push_back(BVHStackEntry(e.node->get_child(1), idx[1])); + + pack_inner(e, stack[stack.size()-2], stack[stack.size()-1]); + } + } + assert(node_size == nextNodeIdx); + /* root index to start traversal at, to handle case of single leaf node */ + pack.root_index = (root->is_leaf())? -1: 0; +} + +void BVH2::refit_nodes() +{ + assert(!params.top_level); + + BoundBox bbox = BoundBox::empty; + uint visibility = 0; + refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility); +} + +void BVH2::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) +{ + if(leaf) { + assert(idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size()); + const int4 *data = &pack.leaf_nodes[idx]; + const int c0 = data[0].x; + const int c1 = data[0].y; + /* refit leaf node */ + for(int prim = c0; prim < c1; prim++) { + int pidx = pack.prim_index[prim]; + int tob = pack.prim_object[prim]; + Object *ob = objects[tob]; + + if(pidx == -1) { + /* object instance */ + bbox.grow(ob->bounds); + } + else { + /* primitives */ + const Mesh *mesh = ob->mesh; + + if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) { + /* curves */ + int str_offset = (params.top_level)? 
mesh->curve_offset: 0; + Mesh::Curve curve = mesh->get_curve(pidx - str_offset); + int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]); + + curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox); + + visibility |= PATH_RAY_CURVE; + + /* motion curves */ + if(mesh->use_motion_blur) { + Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + + if(attr) { + size_t mesh_size = mesh->curve_keys.size(); + size_t steps = mesh->motion_steps - 1; + float3 *key_steps = attr->data_float3(); + + for(size_t i = 0; i < steps; i++) + curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bbox); + } + } + } + else { + /* triangles */ + int tri_offset = (params.top_level)? mesh->tri_offset: 0; + Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset); + const float3 *vpos = &mesh->verts[0]; + + triangle.bounds_grow(vpos, bbox); + + /* motion triangles */ + if(mesh->use_motion_blur) { + Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + + if(attr) { + size_t mesh_size = mesh->verts.size(); + size_t steps = mesh->motion_steps - 1; + float3 *vert_steps = attr->data_float3(); + + for(size_t i = 0; i < steps; i++) + triangle.bounds_grow(vert_steps + i*mesh_size, bbox); + } + } + } + } + + visibility |= ob->visibility; + } + + /* TODO(sergey): De-duplicate with pack_leaf(). 
*/ + float4 leaf_data[BVH_NODE_LEAF_SIZE]; + leaf_data[0].x = __int_as_float(c0); + leaf_data[0].y = __int_as_float(c1); + leaf_data[0].z = __uint_as_float(visibility); + leaf_data[0].w = __uint_as_float(data[0].w); + memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_NODE_LEAF_SIZE); + } + else { + assert(idx + BVH_NODE_SIZE <= pack.nodes.size()); + + const int4 *data = &pack.nodes[idx]; + const bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0; + const int c0 = data[0].z; + const int c1 = data[0].w; + /* refit inner node, set bbox from children */ + BoundBox bbox0 = BoundBox::empty, bbox1 = BoundBox::empty; + uint visibility0 = 0, visibility1 = 0; + + refit_node((c0 < 0)? -c0-1: c0, (c0 < 0), bbox0, visibility0); + refit_node((c1 < 0)? -c1-1: c1, (c1 < 0), bbox1, visibility1); + + if(is_unaligned) { + Transform aligned_space = transform_identity(); + pack_unaligned_node(idx, + aligned_space, aligned_space, + bbox0, bbox1, + c0, c1, + visibility0, + visibility1); + } + else { + pack_aligned_node(idx, + bbox0, bbox1, + c0, c1, + visibility0, + visibility1); + } + + bbox.grow(bbox0); + bbox.grow(bbox1); + visibility = visibility0|visibility1; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/bvh/bvh2.h b/intern/cycles/bvh/bvh2.h new file mode 100644 index 00000000000..df65ddca5b7 --- /dev/null +++ b/intern/cycles/bvh/bvh2.h @@ -0,0 +1,87 @@ +/* + * Adapted from code copyright 2009-2010 NVIDIA Corporation + * Modifications Copyright 2011, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __BVH2_H__ +#define __BVH2_H__ + +#include "bvh/bvh.h" +#include "bvh/bvh_params.h" + +#include "util/util_types.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class BVHNode; +struct BVHStackEntry; +class BVHParams; +class BoundBox; +class LeafNode; +class Object; +class Progress; + +#define BVH_NODE_SIZE 4 +#define BVH_NODE_LEAF_SIZE 1 +#define BVH_UNALIGNED_NODE_SIZE 7 + +/* BVH2 + * + * Typical BVH with each node having two children. + */ +class BVH2 : public BVH { +protected: + /* constructor */ + friend class BVH; + BVH2(const BVHParams& params, const vector<Object*>& objects); + + /* pack */ + void pack_nodes(const BVHNode *root); + + void pack_leaf(const BVHStackEntry& e, + const LeafNode *leaf); + void pack_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1); + + void pack_aligned_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1); + void pack_aligned_node(int idx, + const BoundBox& b0, + const BoundBox& b1, + int c0, int c1, + uint visibility0, uint visibility1); + + void pack_unaligned_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1); + void pack_unaligned_node(int idx, + const Transform& aligned_space0, + const Transform& aligned_space1, + const BoundBox& b0, + const BoundBox& b1, + int c0, int c1, + uint visibility0, uint visibility1); + + /* refit */ + void refit_nodes(); + void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility); +}; + +CCL_NAMESPACE_END + +#endif /* __BVH2_H__ */ diff --git a/intern/cycles/bvh/bvh4.cpp b/intern/cycles/bvh/bvh4.cpp new file mode 100644 index 00000000000..5034ab811d5 --- /dev/null +++ b/intern/cycles/bvh/bvh4.cpp @@ -0,0 +1,516 @@ +/* + * Adapted from code copyright 2009-2010 NVIDIA Corporation + * Modifications Copyright 2011, Blender Foundation. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "bvh/bvh4.h" + +#include "render/mesh.h" +#include "render/object.h" + +#include "bvh/bvh_node.h" +#include "bvh/bvh_unaligned.h" + +CCL_NAMESPACE_BEGIN + +/* Can we avoid this somehow or make more generic? + * + * Perhaps we can merge nodes in actual tree and make our + * life easier all over the place. + */ +static bool node_qbvh_is_unaligned(const BVHNode *node) +{ + const BVHNode *node0 = node->get_child(0), + *node1 = node->get_child(1); + bool has_unaligned = false; + if(node0->is_leaf()) { + has_unaligned |= node0->is_unaligned; + } + else { + has_unaligned |= node0->get_child(0)->is_unaligned; + has_unaligned |= node0->get_child(1)->is_unaligned; + } + if(node1->is_leaf()) { + has_unaligned |= node1->is_unaligned; + } + else { + has_unaligned |= node1->get_child(0)->is_unaligned; + has_unaligned |= node1->get_child(1)->is_unaligned; + } + return has_unaligned; +} + +BVH4::BVH4(const BVHParams& params_, const vector<Object*>& objects_) +: BVH(params_, objects_) +{ + params.use_qbvh = true; +} + +void BVH4::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf) +{ + float4 data[BVH_QNODE_LEAF_SIZE]; + memset(data, 0, sizeof(data)); + if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) { + /* object */ + data[0].x = __int_as_float(~(leaf->lo)); + data[0].y = __int_as_float(0); + } + else { + /* triangle */ + data[0].x = __int_as_float(leaf->lo); + data[0].y = 
__int_as_float(leaf->hi); + } + data[0].z = __uint_as_float(leaf->visibility); + if(leaf->num_triangles() != 0) { + data[0].w = __uint_as_float(pack.prim_type[leaf->lo]); + } + + memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_QNODE_LEAF_SIZE); +} + +void BVH4::pack_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num) +{ + bool has_unaligned = false; + /* Check whether we have to create unaligned node or all nodes are aligned + * and we can cut some corner here. + */ + if(params.use_unaligned_nodes) { + for(int i = 0; i < num; i++) { + if(en[i].node->is_unaligned) { + has_unaligned = true; + break; + } + } + } + if(has_unaligned) { + /* There's no unaligned children, pack into AABB node. */ + pack_unaligned_inner(e, en, num); + } + else { + /* Create unaligned node with orientation transform for each of the + * children. + */ + pack_aligned_inner(e, en, num); + } +} + +void BVH4::pack_aligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num) +{ + BoundBox bounds[4]; + int child[4]; + for(int i = 0; i < num; ++i) { + bounds[i] = en[i].node->bounds; + child[i] = en[i].encodeIdx(); + } + pack_aligned_node(e.idx, + bounds, + child, + e.node->visibility, + e.node->time_from, + e.node->time_to, + num); +} + +void BVH4::pack_aligned_node(int idx, + const BoundBox *bounds, + const int *child, + const uint visibility, + const float time_from, + const float time_to, + const int num) +{ + float4 data[BVH_QNODE_SIZE]; + memset(data, 0, sizeof(data)); + + data[0].x = __uint_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED); + data[0].y = time_from; + data[0].z = time_to; + + for(int i = 0; i < num; i++) { + float3 bb_min = bounds[i].min; + float3 bb_max = bounds[i].max; + + data[1][i] = bb_min.x; + data[2][i] = bb_max.x; + data[3][i] = bb_min.y; + data[4][i] = bb_max.y; + data[5][i] = bb_min.z; + data[6][i] = bb_max.z; + + data[7][i] = __int_as_float(child[i]); + } + + for(int i = num; i < 4; i++) { + /* We store BB which would never be 
recorded as intersection + * so kernel might safely assume there are always 4 child nodes. + */ + data[1][i] = FLT_MAX; + data[2][i] = -FLT_MAX; + + data[3][i] = FLT_MAX; + data[4][i] = -FLT_MAX; + + data[5][i] = FLT_MAX; + data[6][i] = -FLT_MAX; + + data[7][i] = __int_as_float(0); + } + + memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_QNODE_SIZE); +} + +void BVH4::pack_unaligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num) +{ + Transform aligned_space[4]; + BoundBox bounds[4]; + int child[4]; + for(int i = 0; i < num; ++i) { + aligned_space[i] = en[i].node->get_aligned_space(); + bounds[i] = en[i].node->bounds; + child[i] = en[i].encodeIdx(); + } + pack_unaligned_node(e.idx, + aligned_space, + bounds, + child, + e.node->visibility, + e.node->time_from, + e.node->time_to, + num); +} + +void BVH4::pack_unaligned_node(int idx, + const Transform *aligned_space, + const BoundBox *bounds, + const int *child, + const uint visibility, + const float time_from, + const float time_to, + const int num) +{ + float4 data[BVH_UNALIGNED_QNODE_SIZE]; + memset(data, 0, sizeof(data)); + + data[0].x = __uint_as_float(visibility | PATH_RAY_NODE_UNALIGNED); + data[0].y = time_from; + data[0].z = time_to; + + for(int i = 0; i < num; i++) { + Transform space = BVHUnaligned::compute_node_transform( + bounds[i], + aligned_space[i]); + + data[1][i] = space.x.x; + data[2][i] = space.x.y; + data[3][i] = space.x.z; + + data[4][i] = space.y.x; + data[5][i] = space.y.y; + data[6][i] = space.y.z; + + data[7][i] = space.z.x; + data[8][i] = space.z.y; + data[9][i] = space.z.z; + + data[10][i] = space.x.w; + data[11][i] = space.y.w; + data[12][i] = space.z.w; + + data[13][i] = __int_as_float(child[i]); + } + + for(int i = num; i < 4; i++) { + /* We store BB which would never be recorded as intersection + * so kernel might safely assume there are always 4 child nodes. 
+ */ + + data[1][i] = 1.0f; + data[2][i] = 0.0f; + data[3][i] = 0.0f; + + data[4][i] = 0.0f; + data[5][i] = 0.0f; + data[6][i] = 0.0f; + + data[7][i] = 0.0f; + data[8][i] = 0.0f; + data[9][i] = 0.0f; + + data[10][i] = -FLT_MAX; + data[11][i] = -FLT_MAX; + data[12][i] = -FLT_MAX; + + data[13][i] = __int_as_float(0); + } + + memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_QNODE_SIZE); +} + +/* Quad SIMD Nodes */ + +void BVH4::pack_nodes(const BVHNode *root) +{ + /* Calculate size of the arrays required. */ + const size_t num_nodes = root->getSubtreeSize(BVH_STAT_QNODE_COUNT); + const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT); + assert(num_leaf_nodes <= num_nodes); + const size_t num_inner_nodes = num_nodes - num_leaf_nodes; + size_t node_size; + if(params.use_unaligned_nodes) { + const size_t num_unaligned_nodes = + root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_QNODE_COUNT); + node_size = (num_unaligned_nodes * BVH_UNALIGNED_QNODE_SIZE) + + (num_inner_nodes - num_unaligned_nodes) * BVH_QNODE_SIZE; + } + else { + node_size = num_inner_nodes * BVH_QNODE_SIZE; + } + /* Resize arrays. */ + pack.nodes.clear(); + pack.leaf_nodes.clear(); + /* For top level BVH, first merge existing BVH's so we know the offsets. */ + if(params.top_level) { + pack_instances(node_size, num_leaf_nodes*BVH_QNODE_LEAF_SIZE); + } + else { + pack.nodes.resize(node_size); + pack.leaf_nodes.resize(num_leaf_nodes*BVH_QNODE_LEAF_SIZE); + } + + int nextNodeIdx = 0, nextLeafNodeIdx = 0; + + vector<BVHStackEntry> stack; + stack.reserve(BVHParams::MAX_DEPTH*2); + if(root->is_leaf()) { + stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++)); + } + else { + stack.push_back(BVHStackEntry(root, nextNodeIdx)); + nextNodeIdx += node_qbvh_is_unaligned(root) + ? 
BVH_UNALIGNED_QNODE_SIZE + : BVH_QNODE_SIZE; + } + + while(stack.size()) { + BVHStackEntry e = stack.back(); + stack.pop_back(); + + if(e.node->is_leaf()) { + /* leaf node */ + const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node); + pack_leaf(e, leaf); + } + else { + /* Inner node. */ + const BVHNode *node = e.node; + const BVHNode *node0 = node->get_child(0); + const BVHNode *node1 = node->get_child(1); + /* Collect nodes. */ + const BVHNode *nodes[4]; + int numnodes = 0; + if(node0->is_leaf()) { + nodes[numnodes++] = node0; + } + else { + nodes[numnodes++] = node0->get_child(0); + nodes[numnodes++] = node0->get_child(1); + } + if(node1->is_leaf()) { + nodes[numnodes++] = node1; + } + else { + nodes[numnodes++] = node1->get_child(0); + nodes[numnodes++] = node1->get_child(1); + } + /* Push entries on the stack. */ + for(int i = 0; i < numnodes; ++i) { + int idx; + if(nodes[i]->is_leaf()) { + idx = nextLeafNodeIdx++; + } + else { + idx = nextNodeIdx; + nextNodeIdx += node_qbvh_is_unaligned(nodes[i]) + ? BVH_UNALIGNED_QNODE_SIZE + : BVH_QNODE_SIZE; + } + stack.push_back(BVHStackEntry(nodes[i], idx)); + } + /* Set node. */ + pack_inner(e, &stack[stack.size()-numnodes], numnodes); + } + } + assert(node_size == nextNodeIdx); + /* Root index to start traversal at, to handle case of single leaf node. */ + pack.root_index = (root->is_leaf())? -1: 0; +} + +void BVH4::refit_nodes() +{ + assert(!params.top_level); + + BoundBox bbox = BoundBox::empty; + uint visibility = 0; + refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility); +} + +void BVH4::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) +{ + if(leaf) { + int4 *data = &pack.leaf_nodes[idx]; + int4 c = data[0]; + /* Refit leaf node. */ + for(int prim = c.x; prim < c.y; prim++) { + int pidx = pack.prim_index[prim]; + int tob = pack.prim_object[prim]; + Object *ob = objects[tob]; + + if(pidx == -1) { + /* Object instance. */ + bbox.grow(ob->bounds); + } + else { + /* Primitives. 
*/ + const Mesh *mesh = ob->mesh; + + if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) { + /* Curves. */ + int str_offset = (params.top_level)? mesh->curve_offset: 0; + Mesh::Curve curve = mesh->get_curve(pidx - str_offset); + int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]); + + curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox); + + visibility |= PATH_RAY_CURVE; + + /* Motion curves. */ + if(mesh->use_motion_blur) { + Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + + if(attr) { + size_t mesh_size = mesh->curve_keys.size(); + size_t steps = mesh->motion_steps - 1; + float3 *key_steps = attr->data_float3(); + + for(size_t i = 0; i < steps; i++) + curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bbox); + } + } + } + else { + /* Triangles. */ + int tri_offset = (params.top_level)? mesh->tri_offset: 0; + Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset); + const float3 *vpos = &mesh->verts[0]; + + triangle.bounds_grow(vpos, bbox); + + /* Motion triangles. */ + if(mesh->use_motion_blur) { + Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + + if(attr) { + size_t mesh_size = mesh->verts.size(); + size_t steps = mesh->motion_steps - 1; + float3 *vert_steps = attr->data_float3(); + + for(size_t i = 0; i < steps; i++) + triangle.bounds_grow(vert_steps + i*mesh_size, bbox); + } + } + } + } + + visibility |= ob->visibility; + } + + /* TODO(sergey): This is actually a copy of pack_leaf(), + * but this chunk of code only knows actual data and has + * no idea about BVHNode. + * + * Would be nice to de-duplicate code, but trying to make + * making code more general ends up in much nastier code + * in my opinion so far. + * + * Same applies to the inner nodes case below. 
+ */ + float4 leaf_data[BVH_QNODE_LEAF_SIZE]; + leaf_data[0].x = __int_as_float(c.x); + leaf_data[0].y = __int_as_float(c.y); + leaf_data[0].z = __uint_as_float(visibility); + leaf_data[0].w = __uint_as_float(c.w); + memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_QNODE_LEAF_SIZE); + } + else { + int4 *data = &pack.nodes[idx]; + bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0; + int4 c; + if(is_unaligned) { + c = data[13]; + } + else { + c = data[7]; + } + /* Refit inner node, set bbox from children. */ + BoundBox child_bbox[4] = {BoundBox::empty, + BoundBox::empty, + BoundBox::empty, + BoundBox::empty}; + uint child_visibility[4] = {0}; + int num_nodes = 0; + + for(int i = 0; i < 4; ++i) { + if(c[i] != 0) { + refit_node((c[i] < 0)? -c[i]-1: c[i], (c[i] < 0), + child_bbox[i], child_visibility[i]); + ++num_nodes; + bbox.grow(child_bbox[i]); + visibility |= child_visibility[i]; + } + } + + if(is_unaligned) { + Transform aligned_space[4] = {transform_identity(), + transform_identity(), + transform_identity(), + transform_identity()}; + pack_unaligned_node(idx, + aligned_space, + child_bbox, + &c[0], + visibility, + 0.0f, + 1.0f, + 4); + } + else { + pack_aligned_node(idx, + child_bbox, + &c[0], + visibility, + 0.0f, + 1.0f, + 4); + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/bvh/bvh4.h b/intern/cycles/bvh/bvh4.h new file mode 100644 index 00000000000..310909a37e1 --- /dev/null +++ b/intern/cycles/bvh/bvh4.h @@ -0,0 +1,87 @@ +/* + * Adapted from code copyright 2009-2010 NVIDIA Corporation + * Modifications Copyright 2011, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __BVH4_H__ +#define __BVH4_H__ + +#include "bvh/bvh.h" +#include "bvh/bvh_params.h" + +#include "util/util_types.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class BVHNode; +struct BVHStackEntry; +class BVHParams; +class BoundBox; +class LeafNode; +class Object; +class Progress; + +#define BVH_QNODE_SIZE 8 +#define BVH_QNODE_LEAF_SIZE 1 +#define BVH_UNALIGNED_QNODE_SIZE 14 + +/* BVH4 + * + * Quad BVH, with each node having four children, to use with SIMD instructions. + */ +class BVH4 : public BVH { +protected: + /* constructor */ + friend class BVH; + BVH4(const BVHParams& params, const vector<Object*>& objects); + + /* pack */ + void pack_nodes(const BVHNode *root); + + void pack_leaf(const BVHStackEntry& e, const LeafNode *leaf); + void pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num); + + void pack_aligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num); + void pack_aligned_node(int idx, + const BoundBox *bounds, + const int *child, + const uint visibility, + const float time_from, + const float time_to, + const int num); + + void pack_unaligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num); + void pack_unaligned_node(int idx, + const Transform *aligned_space, + const BoundBox *bounds, + const int *child, + const uint visibility, + const float time_from, + const float time_to, + const int num); + + /* refit */ + void refit_nodes(); + void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility); +}; + +CCL_NAMESPACE_END + +#endif /* __BVH4_H__ 
*/ diff --git a/intern/cycles/bvh/bvh_binning.cpp b/intern/cycles/bvh/bvh_binning.cpp index 3226008f511..63a7fc11668 100644 --- a/intern/cycles/bvh/bvh_binning.cpp +++ b/intern/cycles/bvh/bvh_binning.cpp @@ -17,10 +17,10 @@ //#define __KERNEL_SSE__ -#include <stdlib.h> - #include "bvh/bvh_binning.h" +#include <stdlib.h> + #include "util/util_algorithm.h" #include "util/util_boundbox.h" #include "util/util_types.h" diff --git a/intern/cycles/bvh/bvh_binning.h b/intern/cycles/bvh/bvh_binning.h index 285f9c56a62..c2e259b1696 100644 --- a/intern/cycles/bvh/bvh_binning.h +++ b/intern/cycles/bvh/bvh_binning.h @@ -111,5 +111,4 @@ protected: CCL_NAMESPACE_END -#endif - +#endif /* __BVH_BINNING_H__ */ diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp index 95c71b54da0..1880964355c 100644 --- a/intern/cycles/bvh/bvh_build.cpp +++ b/intern/cycles/bvh/bvh_build.cpp @@ -15,8 +15,9 @@ * limitations under the License. */ -#include "bvh/bvh_binning.h" #include "bvh/bvh_build.h" + +#include "bvh/bvh_binning.h" #include "bvh/bvh_node.h" #include "bvh/bvh_params.h" #include "bvh_split.h" diff --git a/intern/cycles/bvh/bvh_build.h b/intern/cycles/bvh/bvh_build.h index 5733708050d..7b245139819 100644 --- a/intern/cycles/bvh/bvh_build.h +++ b/intern/cycles/bvh/bvh_build.h @@ -20,17 +20,17 @@ #include <float.h> -#include "bvh/bvh.h" -#include "bvh/bvh_binning.h" +#include "bvh/bvh_params.h" #include "bvh/bvh_unaligned.h" -#include "util/util_boundbox.h" #include "util/util_task.h" #include "util/util_vector.h" CCL_NAMESPACE_BEGIN +class Boundbox; class BVHBuildTask; +class BVHNode; class BVHSpatialSplitBuildTask; class BVHParams; class InnerNode; diff --git a/intern/cycles/bvh/bvh_node.cpp b/intern/cycles/bvh/bvh_node.cpp index 4f788c66797..4237c62ab5b 100644 --- a/intern/cycles/bvh/bvh_node.cpp +++ b/intern/cycles/bvh/bvh_node.cpp @@ -15,9 +15,10 @@ * limitations under the License. 
*/ +#include "bvh/bvh_node.h" + #include "bvh/bvh.h" #include "bvh/bvh_build.h" -#include "bvh/bvh_node.h" #include "util/util_debug.h" #include "util/util_vector.h" diff --git a/intern/cycles/bvh/bvh_node.h b/intern/cycles/bvh/bvh_node.h index 60511b4b012..1c875f5a524 100644 --- a/intern/cycles/bvh/bvh_node.h +++ b/intern/cycles/bvh/bvh_node.h @@ -19,7 +19,6 @@ #define __BVH_NODE_H__ #include "util/util_boundbox.h" -#include "util/util_debug.h" #include "util/util_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h index 9795a7a4350..7dd699b33a4 100644 --- a/intern/cycles/bvh/bvh_params.h +++ b/intern/cycles/bvh/bvh_params.h @@ -246,4 +246,3 @@ struct BVHSpatialStorage { CCL_NAMESPACE_END #endif /* __BVH_PARAMS_H__ */ - diff --git a/intern/cycles/bvh/bvh_sort.cpp b/intern/cycles/bvh/bvh_sort.cpp index d29629c0279..3a01061b285 100644 --- a/intern/cycles/bvh/bvh_sort.cpp +++ b/intern/cycles/bvh/bvh_sort.cpp @@ -15,9 +15,10 @@ * limitations under the License. */ -#include "bvh/bvh_build.h" #include "bvh/bvh_sort.h" +#include "bvh/bvh_build.h" + #include "util/util_algorithm.h" #include "util/util_debug.h" #include "util/util_task.h" diff --git a/intern/cycles/bvh/bvh_sort.h b/intern/cycles/bvh/bvh_sort.h index b49ca02eb60..936401d8607 100644 --- a/intern/cycles/bvh/bvh_sort.h +++ b/intern/cycles/bvh/bvh_sort.h @@ -18,8 +18,11 @@ #ifndef __BVH_SORT_H__ #define __BVH_SORT_H__ +#include <cstddef> + CCL_NAMESPACE_BEGIN +class BVHReference; class BVHUnaligned; struct Transform; @@ -33,4 +36,3 @@ void bvh_reference_sort(int start, CCL_NAMESPACE_END #endif /* __BVH_SORT_H__ */ - diff --git a/intern/cycles/bvh/bvh_split.cpp b/intern/cycles/bvh/bvh_split.cpp index b10d69a495d..c55ba40b565 100644 --- a/intern/cycles/bvh/bvh_split.cpp +++ b/intern/cycles/bvh/bvh_split.cpp @@ -15,8 +15,9 @@ * limitations under the License. 
*/ -#include "bvh/bvh_build.h" #include "bvh/bvh_split.h" + +#include "bvh/bvh_build.h" #include "bvh/bvh_sort.h" #include "render/mesh.h" diff --git a/intern/cycles/bvh/bvh_unaligned.cpp b/intern/cycles/bvh/bvh_unaligned.cpp index ef227d20ea9..b522a8f3e10 100644 --- a/intern/cycles/bvh/bvh_unaligned.cpp +++ b/intern/cycles/bvh/bvh_unaligned.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ - #include "bvh/bvh_unaligned.h" #include "render/mesh.h" diff --git a/intern/cycles/bvh/bvh_unaligned.h b/intern/cycles/bvh/bvh_unaligned.h index f41bae79e2b..c3ece051cd5 100644 --- a/intern/cycles/bvh/bvh_unaligned.h +++ b/intern/cycles/bvh/bvh_unaligned.h @@ -78,4 +78,3 @@ protected: CCL_NAMESPACE_END #endif /* __BVH_UNALIGNED_H__ */ - diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake index 403a0540963..df88b91f5ac 100644 --- a/intern/cycles/cmake/external_libs.cmake +++ b/intern/cycles/cmake/external_libs.cmake @@ -135,13 +135,5 @@ if(CYCLES_STANDALONE_REPOSITORY) unset(_lib_DIR) else() - if(WIN32) - set(GLOG_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/extern/glog/src/windows) - set(GFLAGS_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/extern/gflags/src) - else() - set(GLOG_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/extern/glog/src) - set(GFLAGS_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/extern/gflags/src) - endif() - set(GFLAGS_NAMESPACE "gflags") set(LLVM_LIBRARIES ${LLVM_LIBRARY}) endif() diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index 6ef2aa1caad..74ec57ddf74 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -25,6 +25,7 @@ set(SRC device.cpp device_cpu.cpp device_cuda.cpp + device_denoising.cpp device_multi.cpp device_opencl.cpp device_split_kernel.cpp @@ -48,6 +49,7 @@ endif() set(SRC_HEADERS device.h + device_denoising.h device_memory.h device_intern.h device_network.h diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 
968af447e29..a54bb77f9f3 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -66,6 +66,10 @@ std::ostream& operator <<(std::ostream &os, << string_from_bool(requested_features.use_patch_evaluation) << std::endl; os << "Use Transparent Shadows: " << string_from_bool(requested_features.use_transparent) << std::endl; + os << "Use Principled BSDF: " + << string_from_bool(requested_features.use_principled) << std::endl; + os << "Use Denoising: " + << string_from_bool(requested_features.use_denoising) << std::endl; return os; } @@ -400,4 +404,16 @@ void Device::free_memory() devices.free_memory(); } + +device_sub_ptr::device_sub_ptr(Device *device, device_memory& mem, int offset, int size, MemoryType type) + : device(device) +{ + ptr = device->mem_alloc_sub_ptr(mem, offset, size, type); +} + +device_sub_ptr::~device_sub_ptr() +{ + device->mem_free_sub_ptr(ptr); +} + CCL_NAMESPACE_END diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index ac06e561795..b3b693c630c 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -124,6 +124,12 @@ public: /* Use various shadow tricks, such as shadow catcher. */ bool use_shadow_tricks; + /* Per-uber shader usage flags. */ + bool use_principled; + + /* Denoising features. */ + bool use_denoising; + DeviceRequestedFeatures() { /* TODO(sergey): Find more meaningful defaults. 
*/ @@ -141,6 +147,8 @@ public: use_patch_evaluation = false; use_transparent = false; use_shadow_tricks = false; + use_principled = false; + use_denoising = false; } bool modified(const DeviceRequestedFeatures& requested_features) @@ -158,7 +166,9 @@ public: use_integrator_branched == requested_features.use_integrator_branched && use_patch_evaluation == requested_features.use_patch_evaluation && use_transparent == requested_features.use_transparent && - use_shadow_tricks == requested_features.use_shadow_tricks); + use_shadow_tricks == requested_features.use_shadow_tricks && + use_principled == requested_features.use_principled && + use_denoising == requested_features.use_denoising); } /* Convert the requested features structure to a build options, @@ -205,6 +215,12 @@ public: if(!use_shadow_tricks) { build_options += " -D__NO_SHADOW_TRICKS__"; } + if(!use_principled) { + build_options += " -D__NO_PRINCIPLED__"; + } + if(!use_denoising) { + build_options += " -D__NO_DENOISING__"; + } return build_options; } }; @@ -220,6 +236,7 @@ struct DeviceDrawParams { }; class Device { + friend class device_sub_ptr; protected: Device(DeviceInfo& info_, Stats &stats_, bool background) : background(background), vertex_buffer(0), info(info_), stats(stats_) {} @@ -229,6 +246,14 @@ protected: /* used for real time display */ unsigned int vertex_buffer; + virtual device_ptr mem_alloc_sub_ptr(device_memory& /*mem*/, int /*offset*/, int /*size*/, MemoryType /*type*/) + { + /* Only required for devices that implement denoising. 
*/ + assert(false); + return (device_ptr) 0; + } + virtual void mem_free_sub_ptr(device_ptr /*ptr*/) {}; + public: virtual ~Device(); @@ -257,6 +282,8 @@ public: virtual void mem_zero(device_memory& mem) = 0; virtual void mem_free(device_memory& mem) = 0; + virtual int mem_address_alignment() { return 16; } + /* constant memory */ virtual void const_copy_to(const char *name, void *host, size_t size) = 0; @@ -304,6 +331,8 @@ public: /* multi device */ virtual void map_tile(Device * /*sub_device*/, RenderTile& /*tile*/) {} virtual int device_number(Device * /*sub_device*/) { return 0; } + virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {} + virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {} /* static */ static Device *create(DeviceInfo& info, Stats &stats, bool background = true); diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 3c481bb2b39..18112437b45 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -25,6 +25,7 @@ #endif #include "device/device.h" +#include "device/device_denoising.h" #include "device/device_intern.h" #include "device/device_split_kernel.h" @@ -34,6 +35,8 @@ #include "kernel/split/kernel_split_data.h" #include "kernel/kernel_globals.h" +#include "kernel/filter/filter.h" + #include "kernel/osl/osl_shader.h" #include "kernel/osl/osl_globals.h" @@ -53,91 +56,108 @@ CCL_NAMESPACE_BEGIN class CPUDevice; -class CPUSplitKernel : public DeviceSplitKernel { - CPUDevice *device; -public: - explicit CPUSplitKernel(CPUDevice *device); - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, - RenderTile& rtile, - int num_global_elements, - device_memory& kernel_globals, - device_memory& kernel_data_, - device_memory& split_data, - device_memory& ray_state, - device_memory& queue_index, - device_memory& use_queues_flag, - device_memory& work_pool_wgs); +/* Has to be outside of the class 
to be shared across template instantiations. */ +static const char *logged_architecture = ""; - virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&); - virtual int2 split_kernel_local_size(); - virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); - virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads); -}; - -class CPUDevice : public Device -{ - static unordered_map<string, void*> kernel_functions; - - static void register_kernel_function(const char* name, void* func) +template<typename F> +class KernelFunctions { +public: + KernelFunctions() { - kernel_functions[name] = func; + kernel = (F)NULL; } - static const char* get_arch_name() + KernelFunctions(F kernel_default, + F kernel_sse2, + F kernel_sse3, + F kernel_sse41, + F kernel_avx, + F kernel_avx2) { + const char *architecture_name = "default"; + kernel = kernel_default; + + /* Silence potential warnings about unused variables + * when compiling without some architectures. 
*/ + (void)kernel_sse2; + (void)kernel_sse3; + (void)kernel_sse41; + (void)kernel_avx; + (void)kernel_avx2; #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 if(system_cpu_support_avx2()) { - return "cpu_avx2"; + architecture_name = "AVX2"; + kernel = kernel_avx2; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX if(system_cpu_support_avx()) { - return "cpu_avx"; + architecture_name = "AVX"; + kernel = kernel_avx; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 if(system_cpu_support_sse41()) { - return "cpu_sse41"; + architecture_name = "SSE4.1"; + kernel = kernel_sse41; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 if(system_cpu_support_sse3()) { - return "cpu_sse3"; + architecture_name = "SSE3"; + kernel = kernel_sse3; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 if(system_cpu_support_sse2()) { - return "cpu_sse2"; + architecture_name = "SSE2"; + kernel = kernel_sse2; } - else #endif - { - return "cpu"; + + if(strstr(architecture_name, logged_architecture) != 0) { + VLOG(1) << "Will be using " << architecture_name << " kernels."; + logged_architecture = architecture_name; } } - template<typename F> - static F get_kernel_function(string name) - { - name = string("kernel_") + get_arch_name() + "_" + name; - - unordered_map<string, void*>::iterator it = kernel_functions.find(name); + inline F operator()() const { + assert(kernel); + return kernel; + } +protected: + F kernel; +}; - if(it == kernel_functions.end()) { - assert(!"kernel function not found"); - return NULL; - } +class CPUSplitKernel : public DeviceSplitKernel { + CPUDevice *device; +public: + explicit CPUSplitKernel(CPUDevice *device); - return (F)it->second; - } + virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& kernel_globals, + device_memory& kernel_data_, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flag, + device_memory& 
work_pool_wgs); - friend class CPUSplitKernel; + virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name, + const DeviceRequestedFeatures&); + virtual int2 split_kernel_local_size(); + virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); + virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads); +}; +class CPUDevice : public Device +{ public: TaskPool task_pool; KernelGlobals kernel_globals; @@ -149,77 +169,92 @@ public: bool use_split_kernel; DeviceRequestedFeatures requested_features; - + + KernelFunctions<void(*)(KernelGlobals *, float *, unsigned int *, int, int, int, int, int)> path_trace_kernel; + KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_half_float_kernel; + KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel; + KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, float*, int, int, int, int, int)> shader_kernel; + + KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int, bool)> filter_divide_shadow_kernel; + KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int, bool)> filter_get_feature_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel; + + KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel; + KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel; + KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel; + 
KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel; + + KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int)> filter_nlm_construct_gramian_kernel; + KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel; + + KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*, + ccl_global uint*, int, int, int, int, int, int, int, int, ccl_global int*, int, + ccl_global char*, ccl_global unsigned int*, unsigned int, ccl_global float*)> data_init_kernel; + unordered_map<string, KernelFunctions<void(*)(KernelGlobals*, KernelData*)> > split_kernels; + +#define KERNEL_FUNCTIONS(name) \ + KERNEL_NAME_EVAL(cpu, name), \ + KERNEL_NAME_EVAL(cpu_sse2, name), \ + KERNEL_NAME_EVAL(cpu_sse3, name), \ + KERNEL_NAME_EVAL(cpu_sse41, name), \ + KERNEL_NAME_EVAL(cpu_avx, name), \ + KERNEL_NAME_EVAL(cpu_avx2, name) + CPUDevice(DeviceInfo& info, Stats &stats, bool background) - : Device(info, stats, background) + : Device(info, stats, background), +#define REGISTER_KERNEL(name) name ## _kernel(KERNEL_FUNCTIONS(name)) + REGISTER_KERNEL(path_trace), + REGISTER_KERNEL(convert_to_half_float), + REGISTER_KERNEL(convert_to_byte), + REGISTER_KERNEL(shader), + REGISTER_KERNEL(filter_divide_shadow), + REGISTER_KERNEL(filter_get_feature), + REGISTER_KERNEL(filter_detect_outliers), + REGISTER_KERNEL(filter_combine_halves), + REGISTER_KERNEL(filter_nlm_calc_difference), + REGISTER_KERNEL(filter_nlm_blur), + REGISTER_KERNEL(filter_nlm_calc_weight), + REGISTER_KERNEL(filter_nlm_update_output), + REGISTER_KERNEL(filter_nlm_normalize), + REGISTER_KERNEL(filter_construct_transform), + REGISTER_KERNEL(filter_nlm_construct_gramian), + REGISTER_KERNEL(filter_finalize), + REGISTER_KERNEL(data_init) 
+#undef REGISTER_KERNEL { #ifdef WITH_OSL kernel_globals.osl = &osl_globals; #endif - - /* do now to avoid thread issues */ - system_cpu_support_sse2(); - system_cpu_support_sse3(); - system_cpu_support_sse41(); - system_cpu_support_avx(); - system_cpu_support_avx2(); - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - VLOG(1) << "Will be using AVX2 kernels."; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - VLOG(1) << "Will be using AVX kernels."; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - VLOG(1) << "Will be using SSE4.1 kernels."; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - VLOG(1) << "Will be using SSE3kernels."; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - VLOG(1) << "Will be using SSE2 kernels."; - } - else -#endif - { - VLOG(1) << "Will be using regular kernels."; - } - use_split_kernel = DebugFlags().cpu.split_kernel; if(use_split_kernel) { VLOG(1) << "Will be using split kernel."; } - kernel_cpu_register_functions(register_kernel_function); -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - kernel_cpu_sse2_register_functions(register_kernel_function); -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - kernel_cpu_sse3_register_functions(register_kernel_function); -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - kernel_cpu_sse41_register_functions(register_kernel_function); -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - kernel_cpu_avx_register_functions(register_kernel_function); -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - kernel_cpu_avx2_register_functions(register_kernel_function); -#endif +#define REGISTER_SPLIT_KERNEL(name) split_kernels[#name] = KernelFunctions<void(*)(KernelGlobals*, KernelData*)>(KERNEL_FUNCTIONS(name)) + REGISTER_SPLIT_KERNEL(path_init); + REGISTER_SPLIT_KERNEL(scene_intersect); + 
REGISTER_SPLIT_KERNEL(lamp_emission); + REGISTER_SPLIT_KERNEL(do_volume); + REGISTER_SPLIT_KERNEL(queue_enqueue); + REGISTER_SPLIT_KERNEL(indirect_background); + REGISTER_SPLIT_KERNEL(shader_setup); + REGISTER_SPLIT_KERNEL(shader_sort); + REGISTER_SPLIT_KERNEL(shader_eval); + REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao); + REGISTER_SPLIT_KERNEL(subsurface_scatter); + REGISTER_SPLIT_KERNEL(direct_lighting); + REGISTER_SPLIT_KERNEL(shadow_blocked_ao); + REGISTER_SPLIT_KERNEL(shadow_blocked_dl); + REGISTER_SPLIT_KERNEL(enqueue_inactive); + REGISTER_SPLIT_KERNEL(next_iteration_setup); + REGISTER_SPLIT_KERNEL(indirect_subsurface); + REGISTER_SPLIT_KERNEL(buffer_update); +#undef REGISTER_SPLIT_KERNEL +#undef KERNEL_FUNCTIONS } ~CPUDevice() @@ -273,13 +308,17 @@ public: if(!mem.data_pointer) { free((void*)mem.device_pointer); } - mem.device_pointer = 0; stats.mem_free(mem.device_size); mem.device_size = 0; } } + virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/) + { + return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset)); + } + void const_copy_to(const char *name, void *host, size_t size) { kernel_const_copy(&kernel_globals, name, host, size); @@ -326,13 +365,8 @@ public: void thread_run(DeviceTask *task) { - if(task->type == DeviceTask::PATH_TRACE) { - if(!use_split_kernel) { - thread_path_trace(*task); - } - else { - thread_path_trace_split(*task); - } + if(task->type == DeviceTask::RENDER) { + thread_render(*task); } else if(task->type == DeviceTask::FILM_CONVERT) thread_film_convert(*task); @@ -349,117 +383,335 @@ public: } }; - void thread_path_trace(DeviceTask& task) + bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task) { - if(task_pool.canceled()) { - if(task.need_finish_queue == false) - return; + mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY); + + TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer; + for(int i = 0; 
i < 9; i++) { + tiles->buffers[i] = buffers[i]; } - KernelGlobals kg = thread_kernel_globals_init(); - RenderTile tile; + return true; + } - void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int); + bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr, + DenoisingTask *task) + { + int4 rect = task->rect; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + int w = align_up(rect.z-rect.x, 4); + int h = rect.w-rect.y; + + float *blurDifference = (float*) task->nlm_state.temporary_1_ptr; + float *difference = (float*) task->nlm_state.temporary_2_ptr; + float *weightAccum = (float*) task->nlm_state.temporary_3_ptr; + + memset(weightAccum, 0, sizeof(float)*w*h); + memset((float*) out_ptr, 0, sizeof(float)*w*h); + + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + + int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)}; + filter_nlm_calc_difference_kernel()(dx, dy, + (float*) guide_ptr, + (float*) variance_ptr, + difference, + local_rect, + w, 0, + a, k_2); + + filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f); + filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f); + filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f); + + filter_nlm_update_output_kernel()(dx, dy, + blurDifference, + (float*) image_ptr, + (float*) out_ptr, + weightAccum, + local_rect, + w, f); + } + + int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y}; + filter_nlm_normalize_kernel()((float*) out_ptr, weightAccum, local_rect, w); -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - path_trace_kernel = kernel_cpu_avx2_path_trace; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - 
path_trace_kernel = kernel_cpu_avx_path_trace; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - path_trace_kernel = kernel_cpu_sse41_path_trace; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - path_trace_kernel = kernel_cpu_sse3_path_trace; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - path_trace_kernel = kernel_cpu_sse2_path_trace; + return true; + } + + bool denoising_construct_transform(DenoisingTask *task) + { + for(int y = 0; y < task->filter_area.w; y++) { + for(int x = 0; x < task->filter_area.z; x++) { + filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer, + x + task->filter_area.x, + y + task->filter_area.y, + y*task->filter_area.z + x, + (float*) task->storage.transform.device_pointer, + (int*) task->storage.rank.device_pointer, + &task->rect.x, + task->buffer.pass_stride, + task->radius, + task->pca_threshold); + } } - else -#endif - { - path_trace_kernel = kernel_cpu_path_trace; + return true; + } + + bool denoising_reconstruct(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr output_ptr, + DenoisingTask *task) + { + mem_zero(task->storage.XtWX); + mem_zero(task->storage.XtWY); + + float *difference = (float*) task->reconstruction_state.temporary_1_ptr; + float *blurDifference = (float*) task->reconstruction_state.temporary_2_ptr; + + int r = task->radius; + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + + int local_rect[4] = {max(0, -dx), max(0, -dy), + task->reconstruction_state.source_w - max(0, dx), + task->reconstruction_state.source_h - max(0, dy)}; + filter_nlm_calc_difference_kernel()(dx, dy, + (float*) color_ptr, + (float*) color_variance_ptr, + difference, + local_rect, + task->buffer.w, + task->buffer.pass_stride, + 1.0f, + task->nlm_k_2); + filter_nlm_blur_kernel()(difference, blurDifference, 
local_rect, task->buffer.w, 4); + filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.w, 4); + filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4); + filter_nlm_construct_gramian_kernel()(dx, dy, + blurDifference, + (float*) task->buffer.mem.device_pointer, + (float*) task->storage.transform.device_pointer, + (int*) task->storage.rank.device_pointer, + (float*) task->storage.XtWX.device_pointer, + (float3*) task->storage.XtWY.device_pointer, + local_rect, + &task->reconstruction_state.filter_rect.x, + task->buffer.w, + task->buffer.h, + 4, + task->buffer.pass_stride); + } + for(int y = 0; y < task->filter_area.w; y++) { + for(int x = 0; x < task->filter_area.z; x++) { + filter_finalize_kernel()(x, + y, + y*task->filter_area.z + x, + task->buffer.w, + task->buffer.h, + (float*) output_ptr, + (int*) task->storage.rank.device_pointer, + (float*) task->storage.XtWX.device_pointer, + (float3*) task->storage.XtWY.device_pointer, + &task->reconstruction_state.buffer_params.x, + task->render_buffer.samples); + } } + return true; + } - while(task.acquire_tile(this, tile)) { - float *render_buffer = (float*)tile.buffer; - uint *rng_state = (uint*)tile.rng_state; - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; - - for(int sample = start_sample; sample < end_sample; sample++) { - if(task.get_cancel() || task_pool.canceled()) { - if(task.need_finish_queue == false) - break; - } + bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, + device_ptr mean_ptr, device_ptr variance_ptr, + int r, int4 rect, DenoisingTask * /*task*/) + { + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + filter_combine_halves_kernel()(x, y, + (float*) mean_ptr, + (float*) variance_ptr, + (float*) a_ptr, + (float*) b_ptr, + &rect.x, + r); + } + } + return true; + } - for(int y = tile.y; y < tile.y + tile.h; y++) { - for(int x = tile.x; x < tile.x + 
tile.w; x++) { - path_trace_kernel(&kg, render_buffer, rng_state, - sample, x, y, tile.offset, tile.stride); - } - } + bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr, + device_ptr sample_variance_ptr, device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, DenoisingTask *task) + { + for(int y = task->rect.y; y < task->rect.w; y++) { + for(int x = task->rect.x; x < task->rect.z; x++) { + filter_divide_shadow_kernel()(task->render_buffer.samples, + task->tiles, + x, y, + (float*) a_ptr, + (float*) b_ptr, + (float*) sample_variance_ptr, + (float*) sv_variance_ptr, + (float*) buffer_variance_ptr, + &task->rect.x, + task->render_buffer.pass_stride, + task->render_buffer.denoising_data_offset, + use_split_kernel); + } + } + return true; + } - tile.sample = sample + 1; + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + DenoisingTask *task) + { + for(int y = task->rect.y; y < task->rect.w; y++) { + for(int x = task->rect.x; x < task->rect.z; x++) { + filter_get_feature_kernel()(task->render_buffer.samples, + task->tiles, + mean_offset, + variance_offset, + x, y, + (float*) mean_ptr, + (float*) variance_ptr, + &task->rect.x, + task->render_buffer.pass_stride, + task->render_buffer.denoising_data_offset, + use_split_kernel); + } + } + return true; + } - task.update_progress(&tile, tile.w*tile.h); + bool denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task) + { + for(int y = task->rect.y; y < task->rect.w; y++) { + for(int x = task->rect.x; x < task->rect.z; x++) { + filter_detect_outliers_kernel()(x, y, + (float*) image_ptr, + (float*) variance_ptr, + (float*) depth_ptr, + (float*) output_ptr, + &task->rect.x, + task->buffer.pass_stride); } + } + return true; + } - task.release_tile(tile); + void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg) + { + float *render_buffer = 
(float*)tile.buffer; + uint *rng_state = (uint*)tile.rng_state; + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; - if(task_pool.canceled()) { + for(int sample = start_sample; sample < end_sample; sample++) { + if(task.get_cancel() || task_pool.canceled()) { if(task.need_finish_queue == false) break; } + + for(int y = tile.y; y < tile.y + tile.h; y++) { + for(int x = tile.x; x < tile.x + tile.w; x++) { + path_trace_kernel()(kg, render_buffer, rng_state, + sample, x, y, tile.offset, tile.stride); + } + } + + tile.sample = sample + 1; + + task.update_progress(&tile, tile.w*tile.h); } + } + + void denoise(DeviceTask &task, RenderTile &tile) + { + tile.sample = tile.start_sample + tile.num_samples; + + DenoisingTask denoising(this); - thread_kernel_globals_free(&kg); + denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising); + denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, &denoising); + denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + denoising.functions.detect_outliers = function_bind(&CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); + denoising.functions.set_tiles = function_bind(&CPUDevice::denoising_set_tiles, this, _1, &denoising); + + denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h); + denoising.render_buffer.samples = tile.sample; + + RenderTile rtiles[9]; + rtiles[4] = tile; + task.map_neighbor_tiles(rtiles, 
this); + denoising.tiles_from_rendertiles(rtiles); + + denoising.init_from_devicetask(task); + + denoising.run_denoising(); + + task.unmap_neighbor_tiles(rtiles, this); + + task.update_progress(&tile, tile.w*tile.h); } - void thread_path_trace_split(DeviceTask& task) + void thread_render(DeviceTask& task) { if(task_pool.canceled()) { if(task.need_finish_queue == false) return; } - RenderTile tile; - - CPUSplitKernel split_kernel(this); - /* allocate buffer for kernel globals */ - device_memory kgbuffer; - kgbuffer.resize(sizeof(KernelGlobals)); + device_only_memory<KernelGlobals> kgbuffer; + kgbuffer.resize(1); mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE); - KernelGlobals *kg = (KernelGlobals*)kgbuffer.device_pointer; - *kg = thread_kernel_globals_init(); + KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init()); - requested_features.max_closure = MAX_CLOSURE; - if(!split_kernel.load_kernels(requested_features)) { - thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); - mem_free(kgbuffer); + CPUSplitKernel *split_kernel = NULL; + if(use_split_kernel) { + split_kernel = new CPUSplitKernel(this); + requested_features.max_closure = MAX_CLOSURE; + if(!split_kernel->load_kernels(requested_features)) { + thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); + mem_free(kgbuffer); - return; + delete split_kernel; + return; + } } + RenderTile tile; while(task.acquire_tile(this, tile)) { - device_memory data; - split_kernel.path_trace(&task, tile, kgbuffer, data); + if(tile.task == RenderTile::PATH_TRACE) { + if(use_split_kernel) { + device_memory data; + split_kernel->path_trace(&task, tile, kgbuffer, data); + } + else { + path_trace(task, tile, kg); + } + } + else if(tile.task == RenderTile::DENOISE) { + denoise(task, tile); + } task.release_tile(tile); @@ -470,7 +722,9 @@ public: } thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); + kg->~KernelGlobals(); 
mem_free(kgbuffer); + delete split_kernel; } void thread_film_convert(DeviceTask& task) @@ -478,86 +732,16 @@ public: float sample_scale = 1.0f/(task.sample + 1); if(task.rgba_half) { - void(*convert_to_half_float_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int); -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - convert_to_half_float_kernel = kernel_cpu_avx2_convert_to_half_float; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - convert_to_half_float_kernel = kernel_cpu_avx_convert_to_half_float; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - convert_to_half_float_kernel = kernel_cpu_sse41_convert_to_half_float; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - convert_to_half_float_kernel = kernel_cpu_sse3_convert_to_half_float; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - convert_to_half_float_kernel = kernel_cpu_sse2_convert_to_half_float; - } - else -#endif - { - convert_to_half_float_kernel = kernel_cpu_convert_to_half_float; - } - for(int y = task.y; y < task.y + task.h; y++) for(int x = task.x; x < task.x + task.w; x++) - convert_to_half_float_kernel(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, - sample_scale, x, y, task.offset, task.stride); + convert_to_half_float_kernel()(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); } else { - void(*convert_to_byte_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int); -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - convert_to_byte_kernel = kernel_cpu_avx2_convert_to_byte; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - convert_to_byte_kernel = kernel_cpu_avx_convert_to_byte; - } - else -#endif -#ifdef 
WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - convert_to_byte_kernel = kernel_cpu_sse41_convert_to_byte; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - convert_to_byte_kernel = kernel_cpu_sse3_convert_to_byte; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - convert_to_byte_kernel = kernel_cpu_sse2_convert_to_byte; - } - else -#endif - { - convert_to_byte_kernel = kernel_cpu_convert_to_byte; - } - for(int y = task.y; y < task.y + task.h; y++) for(int x = task.x; x < task.x + task.w; x++) - convert_to_byte_kernel(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, - sample_scale, x, y, task.offset, task.stride); + convert_to_byte_kernel()(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); } } @@ -569,53 +753,17 @@ public: #ifdef WITH_OSL OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); #endif - void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int, int); - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - shader_kernel = kernel_cpu_avx2_shader; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - shader_kernel = kernel_cpu_avx_shader; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - shader_kernel = kernel_cpu_sse41_shader; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - shader_kernel = kernel_cpu_sse3_shader; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - shader_kernel = kernel_cpu_sse2_shader; - } - else -#endif - { - shader_kernel = kernel_cpu_shader; - } - for(int sample = 0; sample < task.num_samples; sample++) { for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) - shader_kernel(&kg, - 
(uint4*)task.shader_input, - (float4*)task.shader_output, - (float*)task.shader_output_luma, - task.shader_eval_type, - task.shader_filter, - x, - task.offset, - sample); + shader_kernel()(&kg, + (uint4*)task.shader_input, + (float4*)task.shader_output, + (float*)task.shader_output_luma, + task.shader_eval_type, + task.shader_filter, + x, + task.offset, + sample); if(task.get_cancel() || task_pool.canceled()) break; @@ -752,58 +900,6 @@ bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim, device_memory& use_queues_flags, device_memory& work_pool_wgs) { - typedef void(*data_init_t)(KernelGlobals *kg, - ccl_constant KernelData *data, - ccl_global void *split_data_buffer, - int num_elements, - ccl_global char *ray_state, - ccl_global uint *rng_state, - int start_sample, - int end_sample, - int sx, int sy, int sw, int sh, int offset, int stride, - ccl_global int *Queue_index, - int queuesize, - ccl_global char *use_queues_flag, - ccl_global unsigned int *work_pool_wgs, - unsigned int num_samples, - ccl_global float *buffer); - - data_init_t data_init; - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - data_init = kernel_cpu_avx2_data_init; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - data_init = kernel_cpu_avx_data_init; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - data_init = kernel_cpu_sse41_data_init; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - data_init = kernel_cpu_sse3_data_init; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - data_init = kernel_cpu_sse2_data_init; - } - else -#endif - { - data_init = kernel_cpu_data_init; - } - KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer; kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); @@ -811,37 +907,38 @@ bool 
CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim, for(int x = 0; x < dim.global_size[0]; x++) { kg->global_id = make_int2(x, y); - data_init((KernelGlobals*)kernel_globals.device_pointer, - (KernelData*)data.device_pointer, - (void*)split_data.device_pointer, - num_global_elements, - (char*)ray_state.device_pointer, - (uint*)rtile.rng_state, - rtile.start_sample, - rtile.start_sample + rtile.num_samples, - rtile.x, - rtile.y, - rtile.w, - rtile.h, - rtile.offset, - rtile.stride, - (int*)queue_index.device_pointer, - dim.global_size[0] * dim.global_size[1], - (char*)use_queues_flags.device_pointer, - (uint*)work_pool_wgs.device_pointer, - rtile.num_samples, - (float*)rtile.buffer); + device->data_init_kernel()((KernelGlobals*)kernel_globals.device_pointer, + (KernelData*)data.device_pointer, + (void*)split_data.device_pointer, + num_global_elements, + (char*)ray_state.device_pointer, + (uint*)rtile.rng_state, + rtile.start_sample, + rtile.start_sample + rtile.num_samples, + rtile.x, + rtile.y, + rtile.w, + rtile.h, + rtile.offset, + rtile.stride, + (int*)queue_index.device_pointer, + dim.global_size[0] * dim.global_size[1], + (char*)use_queues_flags.device_pointer, + (uint*)work_pool_wgs.device_pointer, + rtile.num_samples, + (float*)rtile.buffer); } } return true; } -SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) +SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(const string& kernel_name, + const DeviceRequestedFeatures&) { CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device); - kernel->func = device->get_kernel_function<void(*)(KernelGlobals*, KernelData*)>(kernel_name); + kernel->func = device->split_kernels[kernel_name](); if(!kernel->func) { delete kernel; return NULL; @@ -865,8 +962,6 @@ uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device return split_data_buffer_size(kg, num_threads); } -unordered_map<string, 
void*> CPUDevice::kernel_functions; - Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background) { return new CPUDevice(info, stats, background); diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index ef283c9d455..3a29538aa13 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -21,11 +21,14 @@ #include <string.h> #include "device/device.h" +#include "device/device_denoising.h" #include "device/device_intern.h" #include "device/device_split_kernel.h" #include "render/buffers.h" +#include "kernel/filter/filter_defines.h" + #ifdef WITH_CUDA_DYNLOAD # include "cuew.h" #else @@ -102,7 +105,8 @@ public: device_memory& use_queues_flag, device_memory& work_pool_wgs); - virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&); + virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name, + const DeviceRequestedFeatures&); virtual int2 split_kernel_local_size(); virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); }; @@ -113,12 +117,13 @@ public: DedicatedTaskPool task_pool; CUdevice cuDevice; CUcontext cuContext; - CUmodule cuModule; + CUmodule cuModule, cuFilterModule; map<device_ptr, bool> tex_interp_map; map<device_ptr, uint> tex_bindless_map; int cuDevId; int cuDevArchitecture; bool first_error; + CUDASplitKernel *split_kernel; struct PixelMem { GLuint cuPBO; @@ -169,7 +174,7 @@ public: CUresult result = stmt; \ \ if(result != CUDA_SUCCESS) { \ - string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \ + string message = string_printf("CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \ if(error_msg == "") \ error_msg = message; \ fprintf(stderr, "%s\n", message.c_str()); \ @@ -221,6 +226,11 @@ public: cuDevice = 0; cuContext = 0; + cuModule = 0; + cuFilterModule = 0; + + split_kernel = NULL; + 
need_bindless_mapping = false; /* intialize */ @@ -260,6 +270,8 @@ public: { task_pool.stop(); + delete split_kernel; + if(info.has_bindless_textures) { tex_free(bindless_mapping); } @@ -296,7 +308,8 @@ public: * kernel sources md5 and only depends on compiler or compilation settings. */ string compile_kernel_get_common_cflags( - const DeviceRequestedFeatures& requested_features, bool split=false) + const DeviceRequestedFeatures& requested_features, + bool filter=false, bool split=false) { const int cuda_version = cuewCompilerVersion(); const int machine = system_cpu_bits(); @@ -311,7 +324,7 @@ public: machine, cuda_version, include_path.c_str()); - if(use_adaptive_compilation()) { + if(!filter && use_adaptive_compilation()) { cflags += " " + requested_features.get_build_options(); } const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS"); @@ -359,8 +372,22 @@ public: return true; } - string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false) + string compile_kernel(const DeviceRequestedFeatures& requested_features, + bool filter=false, bool split=false) { + const char *name, *source; + if(filter) { + name = "filter"; + source = "filter.cu"; + } + else if(split) { + name = "kernel_split"; + source = "kernel_split.cu"; + } + else { + name = "kernel"; + source = "kernel.cu"; + } /* Compute cubin name. */ int major, minor; cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); @@ -368,9 +395,8 @@ public: /* Attempt to use kernel provided with Blender. */ if(!use_adaptive_compilation()) { - const string cubin = path_get(string_printf(split ? 
"lib/kernel_split_sm_%d%d.cubin" - : "lib/kernel_sm_%d%d.cubin", - major, minor)); + const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", + name, major, minor)); VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; if(path_exists(cubin)) { VLOG(1) << "Using precompiled kernel."; @@ -379,7 +405,7 @@ public: } const string common_cflags = - compile_kernel_get_common_cflags(requested_features, split); + compile_kernel_get_common_cflags(requested_features, filter, split); /* Try to use locally compiled kernel. */ const string source_path = path_get("source"); @@ -390,9 +416,8 @@ public: */ const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags); - const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin" - : "cycles_kernel_sm%d%d_%s.cubin", - major, minor, + const string cubin_file = string_printf("cycles_%s_sm%d%d_%s.cubin", + name, major, minor, cubin_md5.c_str()); const string cubin = path_cache_get(path_join("kernels", cubin_file)); VLOG(1) << "Testing for locally compiled kernel " << cubin << "."; @@ -427,7 +452,7 @@ public: const string kernel = path_join( path_join(source_path, "kernel"), path_join("kernels", - path_join("cuda", split ? "kernel_split.cu" : "kernel.cu"))); + path_join("cuda", source))); double starttime = time_dt(); printf("Compiling CUDA kernel ...\n"); @@ -466,6 +491,16 @@ public: bool load_kernels(const DeviceRequestedFeatures& requested_features) { + /* TODO(sergey): Support kernels re-load for CUDA devices. + * + * Currently re-loading kernel will invalidate memory pointers, + * causing problems in cuCtxSynchronize. 
+ */ + if(cuFilterModule && cuModule) { + VLOG(1) << "Skipping kernel reload, not currently supported."; + return true; + } + /* check if cuda init succeeded */ if(cuContext == 0) return false; @@ -475,11 +510,14 @@ public: return false; /* get kernel */ - string cubin = compile_kernel(requested_features, use_split_kernel()); - + string cubin = compile_kernel(requested_features, false, use_split_kernel()); if(cubin == "") return false; + string filter_cubin = compile_kernel(requested_features, true, false); + if(filter_cubin == "") + return false; + /* open module */ cuda_push_context(); @@ -494,6 +532,14 @@ public: if(cuda_error_(result, "cuModuleLoad")) cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str())); + if(path_read_text(filter_cubin, cubin_data)) + result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str()); + else + result = CUDA_ERROR_FILE_NOT_FOUND; + + if(cuda_error_(result, "cuModuleLoad")) + cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str())); + cuda_pop_context(); return (result == CUDA_SUCCESS); @@ -576,6 +622,11 @@ public: } } + virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/) + { + return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset)); + } + void const_copy_to(const char *name, void *host, size_t size) { CUdeviceptr mem; @@ -876,6 +927,393 @@ public: } } + bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task) + { + mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY); + + TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer; + for(int i = 0; i < 9; i++) { + tiles->buffers[i] = buffers[i]; + } + + mem_copy_to(task->tiles_mem); + + return !have_error(); + } + +#define CUDA_GET_BLOCKSIZE(func, w, h) \ + int threads_per_block; \ + cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ + int threads = 
(int)sqrt((float)threads_per_block); \ + int xblocks = ((w) + threads - 1)/threads; \ + int yblocks = ((h) + threads - 1)/threads; + +#define CUDA_LAUNCH_KERNEL(func, args) \ + cuda_assert(cuLaunchKernel(func, \ + xblocks, yblocks, 1, \ + threads, threads, 1, \ + 0, 0, args, 0)); + + bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr, + DenoisingTask *task) + { + if(have_error()) + return false; + + cuda_push_context(); + + int4 rect = task->rect; + int w = align_up(rect.z-rect.x, 4); + int h = rect.w-rect.y; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + CUdeviceptr difference = task->nlm_state.temporary_1_ptr; + CUdeviceptr blurDifference = task->nlm_state.temporary_2_ptr; + CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr; + + cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h)); + cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*w*h)); + + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize; + cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output")); + cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize")); + + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1)); + 
cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1)); + + CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, rect.z-rect.x, rect.w-rect.y); + + int dx, dy; + int4 local_rect; + int channel_offset = 0; + void *calc_difference_args[] = {&dx, &dy, &guide_ptr, &variance_ptr, &difference, &local_rect, &w, &channel_offset, &a, &k_2}; + void *blur_args[] = {&difference, &blurDifference, &local_rect, &w, &f}; + void *calc_weight_args[] = {&blurDifference, &difference, &local_rect, &w, &f}; + void *update_output_args[] = {&dx, &dy, &blurDifference, &image_ptr, &out_ptr, &weightAccum, &local_rect, &w, &f}; + + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + dy = i / (2*r+1) - r; + dx = i % (2*r+1) - r; + local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)); + + CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args); + CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args); + CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL(cuNLMUpdateOutput, update_output_args); + } + + local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y); + void *normalize_args[] = {&out_ptr, &weightAccum, &local_rect, &w}; + CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + bool denoising_construct_transform(DenoisingTask *task) + { + if(have_error()) + return false; + + cuda_push_context(); + + CUfunction cuFilterConstructTransform; + cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform")); + cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED)); + CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, + task->storage.w, + task->storage.h); + + void *args[] = {&task->buffer.mem.device_pointer, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->filter_area, 
+ &task->rect, + &task->radius, + &task->pca_threshold, + &task->buffer.pass_stride}; + CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + bool denoising_reconstruct(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr output_ptr, + DenoisingTask *task) + { + if(have_error()) + return false; + + mem_zero(task->storage.XtWX); + mem_zero(task->storage.XtWY); + + cuda_push_context(); + + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize; + cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); + cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); + + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); + cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); + + CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + CUdeviceptr difference = task->reconstruction_state.temporary_1_ptr; + CUdeviceptr blurDifference = task->reconstruction_state.temporary_2_ptr; + + int r = task->radius; + int f = 4; + float a = 1.0f; + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + + int local_rect[4] = {max(0, -dx), 
max(0, -dy), + task->reconstruction_state.source_w - max(0, dx), + task->reconstruction_state.source_h - max(0, dy)}; + + void *calc_difference_args[] = {&dx, &dy, + &color_ptr, + &color_variance_ptr, + &difference, + &local_rect, + &task->buffer.w, + &task->buffer.pass_stride, + &a, + &task->nlm_k_2}; + CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args); + + void *blur_args[] = {&difference, + &blurDifference, + &local_rect, + &task->buffer.w, + &f}; + CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); + + void *calc_weight_args[] = {&blurDifference, + &difference, + &local_rect, + &task->buffer.w, + &f}; + CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args); + + /* Reuse previous arguments. */ + CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); + + void *construct_gramian_args[] = {&dx, &dy, + &blurDifference, + &task->buffer.mem.device_pointer, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &local_rect, + &task->reconstruction_state.filter_rect, + &task->buffer.w, + &task->buffer.h, + &f, + &task->buffer.pass_stride}; + CUDA_LAUNCH_KERNEL(cuNLMConstructGramian, construct_gramian_args); + } + + void *finalize_args[] = {&task->buffer.w, + &task->buffer.h, + &output_ptr, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->filter_area, + &task->reconstruction_state.buffer_params.x, + &task->render_buffer.samples}; + CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, + device_ptr mean_ptr, device_ptr variance_ptr, + int r, int4 rect, DenoisingTask *task) + { + if(have_error()) + return false; + + cuda_push_context(); + + CUfunction cuFilterCombineHalves; + cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, 
"kernel_cuda_filter_combine_halves")); + cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterCombineHalves, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + void *args[] = {&mean_ptr, + &variance_ptr, + &a_ptr, + &b_ptr, + &rect, + &r}; + CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr, + device_ptr sample_variance_ptr, device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, DenoisingTask *task) + { + if(have_error()) + return false; + + cuda_push_context(); + + CUfunction cuFilterDivideShadow; + cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow")); + cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterDivideShadow, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + bool use_split_variance = use_split_kernel(); + void *args[] = {&task->render_buffer.samples, + &task->tiles_mem.device_pointer, + &a_ptr, + &b_ptr, + &sample_variance_ptr, + &sv_variance_ptr, + &buffer_variance_ptr, + &task->rect, + &task->render_buffer.pass_stride, + &task->render_buffer.denoising_data_offset, + &use_split_variance}; + CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + DenoisingTask *task) + { + if(have_error()) + return false; + + cuda_push_context(); + + CUfunction cuFilterGetFeature; + cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature")); + cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterGetFeature, + 
task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + bool use_split_variance = use_split_kernel(); + void *args[] = {&task->render_buffer.samples, + &task->tiles_mem.device_pointer, + &mean_offset, + &variance_offset, + &mean_ptr, + &variance_ptr, + &task->rect, + &task->render_buffer.pass_stride, + &task->render_buffer.denoising_data_offset, + &use_split_variance}; + CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + bool denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task) + { + if(have_error()) + return false; + + cuda_push_context(); + + CUfunction cuFilterDetectOutliers; + cuda_assert(cuModuleGetFunction(&cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers")); + cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterDetectOutliers, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + void *args[] = {&image_ptr, + &variance_ptr, + &depth_ptr, + &output_ptr, + &task->rect, + &task->buffer.pass_stride}; + + CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + void denoise(RenderTile &rtile, const DeviceTask &task) + { + DenoisingTask denoising(this); + + denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising); + denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, &denoising); + denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = 
function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + denoising.functions.detect_outliers = function_bind(&CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); + denoising.functions.set_tiles = function_bind(&CUDADevice::denoising_set_tiles, this, _1, &denoising); + + denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); + denoising.render_buffer.samples = rtile.sample; + + RenderTile rtiles[9]; + rtiles[4] = rtile; + task.map_neighbor_tiles(rtiles, this); + denoising.tiles_from_rendertiles(rtiles); + + denoising.init_from_devicetask(task); + + denoising.run_denoising(); + + task.unmap_neighbor_tiles(rtiles, this); + } + void path_trace(RenderTile& rtile, int sample, bool branched) { if(have_error()) @@ -1300,7 +1738,7 @@ public: void thread_run(DeviceTask *task) { - if(task->type == DeviceTask::PATH_TRACE) { + if(task->type == DeviceTask::RENDER) { RenderTile tile; bool branched = task->integrator_branched; @@ -1308,47 +1746,56 @@ public: /* Upload Bindless Mapping */ load_bindless_mapping(); - if(!use_split_kernel()) { - /* keep rendering tiles until done */ - while(task->acquire_tile(this, tile)) { - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; + DeviceRequestedFeatures requested_features; + if(use_split_kernel()) { + if(!use_adaptive_compilation()) { + requested_features.max_closure = 64; + } + + if(split_kernel == NULL) { + split_kernel = new CUDASplitKernel(this); + split_kernel->load_kernels(requested_features); + } + } - for(int sample = start_sample; sample < end_sample; sample++) { - if(task->get_cancel()) { - if(task->need_finish_queue == false) - break; - } + /* keep rendering tiles until done */ + while(task->acquire_tile(this, tile)) { + if(tile.task == RenderTile::PATH_TRACE) { + if(use_split_kernel()) 
{ + device_memory void_buffer; + split_kernel->path_trace(task, tile, void_buffer, void_buffer); + } + else { + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; - path_trace(tile, sample, branched); + for(int sample = start_sample; sample < end_sample; sample++) { + if(task->get_cancel()) { + if(task->need_finish_queue == false) + break; + } - tile.sample = sample + 1; + path_trace(tile, sample, branched); - task->update_progress(&tile, tile.w*tile.h); - } + tile.sample = sample + 1; - task->release_tile(tile); - } - } - else { - DeviceRequestedFeatures requested_features; - if(!use_adaptive_compilation()) { - requested_features.max_closure = 64; + task->update_progress(&tile, tile.w*tile.h); + } + } } + else if(tile.task == RenderTile::DENOISE) { + tile.sample = tile.start_sample + tile.num_samples; - CUDASplitKernel split_kernel(this); - split_kernel.load_kernels(requested_features); + denoise(tile, *task); - while(task->acquire_tile(this, tile)) { - device_memory void_buffer; - split_kernel.path_trace(task, tile, void_buffer, void_buffer); + task->update_progress(&tile, tile.w*tile.h); + } - task->release_tile(tile); + task->release_tile(tile); - if(task->get_cancel()) { - if(task->need_finish_queue == false) - break; - } + if(task->get_cancel()) { + if(task->need_finish_queue == false) + break; } } } @@ -1591,7 +2038,8 @@ bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim return !device->have_error(); } -SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) +SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(const string& kernel_name, + const DeviceRequestedFeatures&) { CUfunction func; @@ -1627,7 +2075,8 @@ int2 CUDASplitKernel::split_kernel_global_size(device_memory& kg, device_memory& << string_human_readable_size(free) << ")."; size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2); - int2 
global_size = make_int2(round_down((int)sqrt(num_elements), 32), (int)sqrt(num_elements)); + size_t side = round_down((int)sqrt(num_elements), 32); + int2 global_size = make_int2(side, round_down(num_elements / side, 16)); VLOG(1) << "Global size: " << global_size << "."; return global_size; } diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp new file mode 100644 index 00000000000..619cc1d171e --- /dev/null +++ b/intern/cycles/device/device_denoising.cpp @@ -0,0 +1,232 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "device/device_denoising.h" + +#include "kernel/filter/filter_defines.h" + +CCL_NAMESPACE_BEGIN + +void DenoisingTask::init_from_devicetask(const DeviceTask &task) +{ + radius = task.denoising_radius; + nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising_strength)); + if(task.denoising_relative_pca) { + pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising_feature_strength)); + } + else { + pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising_feature_strength)); + } + + render_buffer.pass_stride = task.pass_stride; + render_buffer.denoising_data_offset = task.pass_denoising_data; + render_buffer.denoising_clean_offset = task.pass_denoising_clean; + + /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */ + rect = make_int4(max(tiles->x[0], filter_area.x - radius), + max(tiles->y[0], filter_area.y - radius), + min(tiles->x[3], filter_area.x + filter_area.z + radius), + min(tiles->y[3], filter_area.y + filter_area.w + radius)); +} + +void DenoisingTask::tiles_from_rendertiles(RenderTile *rtiles) +{ + tiles = (TilesInfo*) tiles_mem.resize(sizeof(TilesInfo)/sizeof(int)); + + device_ptr buffers[9]; + for(int i = 0; i < 9; i++) { + buffers[i] = rtiles[i].buffer; + tiles->offsets[i] = rtiles[i].offset; + tiles->strides[i] = rtiles[i].stride; + } + tiles->x[0] = rtiles[3].x; + tiles->x[1] = rtiles[4].x; + tiles->x[2] = rtiles[5].x; + tiles->x[3] = rtiles[5].x + rtiles[5].w; + tiles->y[0] = rtiles[1].y; + tiles->y[1] = rtiles[4].y; + tiles->y[2] = rtiles[7].y; + tiles->y[3] = rtiles[7].y + rtiles[7].h; + + render_buffer.offset = rtiles[4].offset; + render_buffer.stride = rtiles[4].stride; + render_buffer.ptr = rtiles[4].buffer; + + functions.set_tiles(buffers); +} + +bool DenoisingTask::run_denoising() +{ + /* Allocate denoising buffer. 
*/ + buffer.passes = 14; + buffer.w = align_up(rect.z - rect.x, 4); + buffer.h = rect.w - rect.y; + buffer.pass_stride = align_up(buffer.w * buffer.h, divide_up(device->mem_address_alignment(), sizeof(float))); + buffer.mem.resize(buffer.pass_stride * buffer.passes); + device->mem_alloc("Denoising Pixel Buffer", buffer.mem, MEM_READ_WRITE); + + device_ptr null_ptr = (device_ptr) 0; + + /* Prefilter shadow feature. */ + { + device_sub_ptr unfiltered_a (device, buffer.mem, 0, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr unfiltered_b (device, buffer.mem, 1*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr sample_var (device, buffer.mem, 2*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr sample_var_var (device, buffer.mem, 3*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr buffer_var (device, buffer.mem, 5*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr filtered_var (device, buffer.mem, 6*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr nlm_temporary_1(device, buffer.mem, 7*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr nlm_temporary_2(device, buffer.mem, 8*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr nlm_temporary_3(device, buffer.mem, 9*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + + nlm_state.temporary_1_ptr = *nlm_temporary_1; + nlm_state.temporary_2_ptr = *nlm_temporary_2; + nlm_state.temporary_3_ptr = *nlm_temporary_3; + + /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */ + functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var); + + /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. 
*/ + nlm_state.set_parameters(6, 3, 4.0f, 1.0f); + functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var); + + /* Reuse memory, the previous data isn't needed anymore. */ + device_ptr filtered_a = *buffer_var, + filtered_b = *sample_var; + /* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */ + nlm_state.set_parameters(5, 3, 1.0f, 0.25f); + functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a); + functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b); + + device_ptr residual_var = *sample_var_var; + /* Estimate the residual variance between the two filtered halves. */ + functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect); + + device_ptr final_a = *unfiltered_a, + final_b = *unfiltered_b; + /* Use the residual variance for a second filter pass. */ + nlm_state.set_parameters(4, 2, 1.0f, 0.5f); + functions.non_local_means(filtered_a, filtered_b, residual_var, final_a); + functions.non_local_means(filtered_b, filtered_a, residual_var, final_b); + + /* Combine the two double-filtered halves to a final shadow feature. */ + device_sub_ptr shadow_pass(device, buffer.mem, 4*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect); + } + + /* Prefilter general features. 
*/ + { + device_sub_ptr unfiltered (device, buffer.mem, 8*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr variance (device, buffer.mem, 9*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr nlm_temporary_1(device, buffer.mem, 10*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr nlm_temporary_2(device, buffer.mem, 11*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr nlm_temporary_3(device, buffer.mem, 12*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + + nlm_state.temporary_1_ptr = *nlm_temporary_1; + nlm_state.temporary_2_ptr = *nlm_temporary_2; + nlm_state.temporary_3_ptr = *nlm_temporary_3; + + int mean_from[] = { 0, 1, 2, 12, 6, 7, 8 }; + int variance_from[] = { 3, 4, 5, 13, 9, 10, 11}; + int pass_to[] = { 1, 2, 3, 0, 5, 6, 7}; + for(int pass = 0; pass < 7; pass++) { + device_sub_ptr feature_pass(device, buffer.mem, pass_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + /* Get the unfiltered pass and its variance from the RenderBuffers. */ + functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance); + /* Smooth the pass and store the result in the denoising buffers. */ + nlm_state.set_parameters(2, 2, 1.0f, 0.25f); + functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass); + } + } + + /* Copy color passes. 
*/ + { + int mean_from[] = {20, 21, 22}; + int variance_from[] = {23, 24, 25}; + int mean_to[] = { 8, 9, 10}; + int variance_to[] = {11, 12, 13}; + int num_color_passes = 3; + + device_only_memory<float> temp_color; + temp_color.resize(3*buffer.pass_stride); + device->mem_alloc("Denoising temporary color", temp_color, MEM_READ_WRITE); + + for(int pass = 0; pass < num_color_passes; pass++) { + device_sub_ptr color_pass(device, temp_color, pass*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr color_var_pass(device, buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE); + functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass); + } + + { + device_sub_ptr depth_pass (device, buffer.mem, 0, buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr color_var_pass(device, buffer.mem, variance_to[0]*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr output_pass (device, buffer.mem, mean_to[0]*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE); + functions.detect_outliers(temp_color.device_pointer, *color_var_pass, *depth_pass, *output_pass); + } + + device->mem_free(temp_color); + } + + storage.w = filter_area.z; + storage.h = filter_area.w; + storage.transform.resize(storage.w*storage.h*TRANSFORM_SIZE); + storage.rank.resize(storage.w*storage.h); + device->mem_alloc("Denoising Transform", storage.transform, MEM_READ_WRITE); + device->mem_alloc("Denoising Rank", storage.rank, MEM_READ_WRITE); + + functions.construct_transform(); + + device_only_memory<float> temporary_1; + device_only_memory<float> temporary_2; + temporary_1.resize(buffer.w*buffer.h); + temporary_2.resize(buffer.w*buffer.h); + device->mem_alloc("Denoising NLM temporary 1", temporary_1, MEM_READ_WRITE); + device->mem_alloc("Denoising NLM temporary 2", temporary_2, MEM_READ_WRITE); + reconstruction_state.temporary_1_ptr = temporary_1.device_pointer; + reconstruction_state.temporary_2_ptr = 
temporary_2.device_pointer; + + storage.XtWX.resize(storage.w*storage.h*XTWX_SIZE); + storage.XtWY.resize(storage.w*storage.h*XTWY_SIZE); + device->mem_alloc("Denoising XtWX", storage.XtWX, MEM_READ_WRITE); + device->mem_alloc("Denoising XtWY", storage.XtWY, MEM_READ_WRITE); + + reconstruction_state.filter_rect = make_int4(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h); + int tile_coordinate_offset = filter_area.y*render_buffer.stride + filter_area.x; + reconstruction_state.buffer_params = make_int4(render_buffer.offset + tile_coordinate_offset, + render_buffer.stride, + render_buffer.pass_stride, + render_buffer.denoising_clean_offset); + reconstruction_state.source_w = rect.z-rect.x; + reconstruction_state.source_h = rect.w-rect.y; + + { + device_sub_ptr color_ptr (device, buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE); + device_sub_ptr color_var_ptr(device, buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE); + functions.reconstruct(*color_ptr, *color_var_ptr, render_buffer.ptr); + } + + device->mem_free(storage.XtWX); + device->mem_free(storage.XtWY); + device->mem_free(storage.transform); + device->mem_free(storage.rank); + device->mem_free(temporary_1); + device->mem_free(temporary_2); + device->mem_free(buffer.mem); + device->mem_free(tiles_mem); + return true; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h new file mode 100644 index 00000000000..def7b72f67d --- /dev/null +++ b/intern/cycles/device/device_denoising.h @@ -0,0 +1,148 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DEVICE_DENOISING_H__ +#define __DEVICE_DENOISING_H__ + +#include "device/device.h" + +#include "render/buffers.h" + +#include "kernel/filter/filter_defines.h" + +CCL_NAMESPACE_BEGIN + +class DenoisingTask { +public: + /* Parameters of the denoising algorithm. */ + int radius; + float nlm_k_2; + float pca_threshold; + + /* Pointer and parameters of the RenderBuffers. */ + struct RenderBuffers { + int denoising_data_offset; + int denoising_clean_offset; + int pass_stride; + int offset; + int stride; + device_ptr ptr; + int samples; + } render_buffer; + + TilesInfo *tiles; + device_vector<int> tiles_mem; + void tiles_from_rendertiles(RenderTile *rtiles); + + int4 rect; + int4 filter_area; + + struct DeviceFunctions { + function<bool(device_ptr image_ptr, /* Contains the values that are smoothed. */ + device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */ + device_ptr variance_ptr, /* Contains the variance of the guide image. */ + device_ptr out_ptr /* The filtered output is written into this image. 
*/ + )> non_local_means; + function<bool(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr output_ptr + )> reconstruct; + function<bool()> construct_transform; + + function<bool(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, + int4 rect + )> combine_halves; + function<bool(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr + )> divide_shadow; + function<bool(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr + )> get_feature; + function<bool(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr + )> detect_outliers; + function<bool(device_ptr*)> set_tiles; + } functions; + + /* Stores state of the current Reconstruction operation, + * which is accessed by the device in order to perform the operation. */ + struct ReconstructionState { + device_ptr temporary_1_ptr; /* There two images are used as temporary storage. */ + device_ptr temporary_2_ptr; + + int4 filter_rect; + int4 buffer_params; + + int source_w; + int source_h; + } reconstruction_state; + + /* Stores state of the current NLM operation, + * which is accessed by the device in order to perform the operation. */ + struct NLMState { + device_ptr temporary_1_ptr; /* There three images are used as temporary storage. */ + device_ptr temporary_2_ptr; + device_ptr temporary_3_ptr; + + int r; /* Search radius of the filter. */ + int f; /* Patch size of the filter. */ + float a; /* Variance compensation factor in the MSE estimation. */ + float k_2; /* Squared value of the k parameter of the filter. 
*/ + + void set_parameters(int r_, int f_, float a_, float k_2_) { r = r_; f = f_; a = a_, k_2 = k_2_; } + } nlm_state; + + struct Storage { + device_only_memory<float> transform; + device_only_memory<int> rank; + device_only_memory<float> XtWX; + device_only_memory<float3> XtWY; + int w; + int h; + } storage; + + DenoisingTask(Device *device) : device(device) {} + + void init_from_devicetask(const DeviceTask &task); + + bool run_denoising(); + + struct DenoiseBuffers { + int pass_stride; + int passes; + int w; + int h; + device_only_memory<float> mem; + } buffer; + +protected: + Device *device; +}; + +CCL_NAMESPACE_END + +#endif /* __DEVICE_DENOISING_H__ */ diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h index 4b10514a9d2..b63dd00068b 100644 --- a/intern/cycles/device/device_memory.h +++ b/intern/cycles/device/device_memory.h @@ -35,6 +35,8 @@ CCL_NAMESPACE_BEGIN +class Device; + enum MemoryType { MEM_READ_ONLY, MEM_WRITE_ONLY, @@ -144,7 +146,7 @@ template<> struct device_type_traits<float2> { template<> struct device_type_traits<float3> { static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 3; + static const int num_elements = 4; }; template<> struct device_type_traits<float4> { @@ -173,6 +175,9 @@ class device_memory { public: size_t memory_size() { return data_size*data_elements*datatype_size(data_type); } + size_t memory_elements_size(int elements) { + return elements*data_elements*datatype_size(data_type); + } /* data information */ DataType data_type; @@ -213,6 +218,22 @@ protected: device_memory& operator = (const device_memory&); }; +template<typename T> +class device_only_memory : public device_memory +{ +public: + device_only_memory() + { + data_type = device_type_traits<T>::data_type; + data_elements = max(device_type_traits<T>::num_elements, 1); + } + + void resize(size_t num) + { + device_memory::resize(num*sizeof(T)); + } +}; + /* Device Vector */ template<typename T> class 
device_vector : public device_memory @@ -299,6 +320,27 @@ private: array<T> data; }; +/* A device_sub_ptr is a pointer into another existing memory. + * Therefore, it is not allocated separately, but just created from the already allocated base memory. + * It is freed automatically when it goes out of scope, which should happen before the base memory is freed. + * Note that some devices require the offset and size of the sub_ptr to be properly aligned. */ +class device_sub_ptr +{ +public: + device_sub_ptr(Device *device, device_memory& mem, int offset, int size, MemoryType type); + ~device_sub_ptr(); + /* No copying. */ + device_sub_ptr& operator = (const device_sub_ptr&); + + device_ptr operator*() const + { + return ptr; + } +protected: + Device *device; + device_ptr ptr; +}; + CCL_NAMESPACE_END #endif /* __DEVICE_MEMORY_H__ */ diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp index 624260a81c8..bc505b676fc 100644 --- a/intern/cycles/device/device_multi.cpp +++ b/intern/cycles/device/device_multi.cpp @@ -299,6 +299,60 @@ public: return -1; } + void map_neighbor_tiles(Device *sub_device, RenderTile *tiles) + { + for(int i = 0; i < 9; i++) { + if(!tiles[i].buffers) { + continue; + } + /* If the tile was rendered on another device, copy its memory to + * to the current device now, for the duration of the denoising task. + * Note that this temporarily modifies the RenderBuffers and calls + * the device, so this function is not thread safe. 
*/ + if(tiles[i].buffers->device != sub_device) { + device_vector<float> &mem = tiles[i].buffers->buffer; + + tiles[i].buffers->copy_from_device(); + device_ptr original_ptr = mem.device_pointer; + mem.device_pointer = 0; + sub_device->mem_alloc("Temporary memory for neighboring tile", mem, MEM_READ_WRITE); + sub_device->mem_copy_to(mem); + tiles[i].buffer = mem.device_pointer; + mem.device_pointer = original_ptr; + } + } + } + + void unmap_neighbor_tiles(Device * sub_device, RenderTile * tiles) + { + for(int i = 0; i < 9; i++) { + if(!tiles[i].buffers) { + continue; + } + if(tiles[i].buffers->device != sub_device) { + device_vector<float> &mem = tiles[i].buffers->buffer; + + device_ptr original_ptr = mem.device_pointer; + mem.device_pointer = tiles[i].buffer; + + /* Copy denoised tile to the host. */ + if(i == 4) { + tiles[i].buffers->copy_from_device(sub_device); + } + + size_t mem_size = mem.device_size; + sub_device->mem_free(mem); + mem.device_pointer = original_ptr; + mem.device_size = mem_size; + + /* Copy denoised tile to the original device. 
*/ + if(i == 4) { + tiles[i].buffers->device->mem_copy_to(mem); + } + } + } + } + int get_split_task_count(DeviceTask& task) { int total_tasks = 0; diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp index edd2047debc..681b8214b03 100644 --- a/intern/cycles/device/device_opencl.cpp +++ b/intern/cycles/device/device_opencl.cpp @@ -130,10 +130,22 @@ string device_opencl_capabilities(void) opencl_assert(func(id, what, sizeof(data), &data, NULL)); \ result += string_printf("%s: %s\n", name, data); \ } while(false) +#define APPEND_STRING_EXTENSION_INFO(func, id, name, what) \ + do { \ + char data[1024] = "\0"; \ + size_t length = 0; \ + if(func(id, what, sizeof(data), &data, &length) == CL_SUCCESS) { \ + if(length != 0 && data[0] != '\0') { \ + result += string_printf("%s: %s\n", name, data); \ + } \ + } \ + } while(false) #define APPEND_PLATFORM_STRING_INFO(id, name, what) \ APPEND_STRING_INFO(clGetPlatformInfo, id, "\tPlatform " name, what) #define APPEND_DEVICE_STRING_INFO(id, name, what) \ APPEND_STRING_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what) +#define APPEND_DEVICE_STRING_EXTENSION_INFO(id, name, what) \ + APPEND_STRING_EXTENSION_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what) vector<cl_device_id> device_ids; for(cl_uint platform = 0; platform < num_platforms; ++platform) { @@ -167,6 +179,7 @@ string device_opencl_capabilities(void) result += string_printf("\t\tDevice: #%u\n", device); APPEND_DEVICE_STRING_INFO(device_id, "Name", CL_DEVICE_NAME); + APPEND_DEVICE_STRING_EXTENSION_INFO(device_id, "Board Name", CL_DEVICE_BOARD_NAME_AMD); APPEND_DEVICE_STRING_INFO(device_id, "Vendor", CL_DEVICE_VENDOR); APPEND_DEVICE_STRING_INFO(device_id, "OpenCL C Version", CL_DEVICE_OPENCL_C_VERSION); APPEND_DEVICE_STRING_INFO(device_id, "Profile", CL_DEVICE_PROFILE); diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp index 981ec74fe56..d2b3a89fa98 100644 --- 
a/intern/cycles/device/device_split_kernel.cpp +++ b/intern/cycles/device/device_split_kernel.cpp @@ -19,6 +19,7 @@ #include "kernel/kernel_types.h" #include "kernel/split/kernel_split_data_types.h" +#include "util/util_logging.h" #include "util/util_time.h" CCL_NAMESPACE_BEGIN @@ -38,12 +39,15 @@ DeviceSplitKernel::DeviceSplitKernel(Device *device) : device(device) kernel_do_volume = NULL; kernel_queue_enqueue = NULL; kernel_indirect_background = NULL; + kernel_shader_setup = NULL; + kernel_shader_sort = NULL; kernel_shader_eval = NULL; kernel_holdout_emission_blurring_pathtermination_ao = NULL; kernel_subsurface_scatter = NULL; kernel_direct_lighting = NULL; kernel_shadow_blocked_ao = NULL; kernel_shadow_blocked_dl = NULL; + kernel_enqueue_inactive = NULL; kernel_next_iteration_setup = NULL; kernel_indirect_subsurface = NULL; kernel_buffer_update = NULL; @@ -63,12 +67,15 @@ DeviceSplitKernel::~DeviceSplitKernel() delete kernel_do_volume; delete kernel_queue_enqueue; delete kernel_indirect_background; + delete kernel_shader_setup; + delete kernel_shader_sort; delete kernel_shader_eval; delete kernel_holdout_emission_blurring_pathtermination_ao; delete kernel_subsurface_scatter; delete kernel_direct_lighting; delete kernel_shadow_blocked_ao; delete kernel_shadow_blocked_dl; + delete kernel_enqueue_inactive; delete kernel_next_iteration_setup; delete kernel_indirect_subsurface; delete kernel_buffer_update; @@ -88,12 +95,15 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_fe LOAD_KERNEL(do_volume); LOAD_KERNEL(queue_enqueue); LOAD_KERNEL(indirect_background); + LOAD_KERNEL(shader_setup); + LOAD_KERNEL(shader_sort); LOAD_KERNEL(shader_eval); LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao); LOAD_KERNEL(subsurface_scatter); LOAD_KERNEL(direct_lighting); LOAD_KERNEL(shadow_blocked_ao); LOAD_KERNEL(shadow_blocked_dl); + LOAD_KERNEL(enqueue_inactive); LOAD_KERNEL(next_iteration_setup); LOAD_KERNEL(indirect_subsurface); 
LOAD_KERNEL(buffer_update); @@ -108,6 +118,9 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_fe size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size) { uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024; + VLOG(1) << "Split state element size: " + << string_human_readable_number(size_per_element) << " bytes. (" + << string_human_readable_size(size_per_element) << ")."; return max_buffer_size / size_per_element; } @@ -156,13 +169,13 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, unsigned int max_work_groups = num_global_elements / work_pool_size + 1; /* Allocate work_pool_wgs memory. */ - work_pool_wgs.resize(max_work_groups * sizeof(unsigned int)); + work_pool_wgs.resize(max_work_groups); device->mem_alloc("work_pool_wgs", work_pool_wgs, MEM_READ_WRITE); - queue_index.resize(NUM_QUEUES * sizeof(int)); + queue_index.resize(NUM_QUEUES); device->mem_alloc("queue_index", queue_index, MEM_READ_WRITE); - use_queues_flag.resize(sizeof(char)); + use_queues_flag.resize(1); device->mem_alloc("use_queues_flag", use_queues_flag, MEM_READ_WRITE); ray_state.resize(num_global_elements); @@ -227,6 +240,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size); bool activeRaysAvailable = true; + double cancel_time = DBL_MAX; while(activeRaysAvailable) { /* Do path-iteration in host [Enqueue Path-iteration kernels. 
*/ @@ -236,18 +250,29 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size); ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shader_setup, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shader_sort, global_size, local_size); ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size); ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size); ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size); ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size); ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(enqueue_inactive, global_size, local_size); ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size); ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size); ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size); - if(task->get_cancel()) { + if(task->get_cancel() && cancel_time == DBL_MAX) { + /* Wait up to twice as many seconds for current samples to finish + * to avoid artifacts in render result from ending too soon. 
+ */ + cancel_time = time_dt() + 2.0 * time_multiplier; + } + + if(time_dt() > cancel_time) { return true; } } @@ -271,7 +296,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, } } - if(task->get_cancel()) { + if(time_dt() > cancel_time) { return true; } } diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h index 55548122c0c..9c42cb58520 100644 --- a/intern/cycles/device/device_split_kernel.h +++ b/intern/cycles/device/device_split_kernel.h @@ -61,12 +61,15 @@ private: SplitKernelFunction *kernel_do_volume; SplitKernelFunction *kernel_queue_enqueue; SplitKernelFunction *kernel_indirect_background; + SplitKernelFunction *kernel_shader_setup; + SplitKernelFunction *kernel_shader_sort; SplitKernelFunction *kernel_shader_eval; SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao; SplitKernelFunction *kernel_subsurface_scatter; SplitKernelFunction *kernel_direct_lighting; SplitKernelFunction *kernel_shadow_blocked_ao; SplitKernelFunction *kernel_shadow_blocked_dl; + SplitKernelFunction *kernel_enqueue_inactive; SplitKernelFunction *kernel_next_iteration_setup; SplitKernelFunction *kernel_indirect_subsurface; SplitKernelFunction *kernel_buffer_update; @@ -78,16 +81,16 @@ private: */ device_memory split_data; device_vector<uchar> ray_state; - device_memory queue_index; /* Array of size num_queues * sizeof(int) that tracks the size of each queue. */ + device_only_memory<int> queue_index; /* Array of size num_queues that tracks the size of each queue. */ /* Flag to make sceneintersect and lampemission kernel use queues. */ - device_memory use_queues_flag; + device_only_memory<char> use_queues_flag; /* Approximate time it takes to complete one sample */ double avg_time_per_sample; /* Work pool with respect to each work group. */ - device_memory work_pool_wgs; + device_only_memory<unsigned int> work_pool_wgs; /* clos_max value for which the kernels have been loaded currently. 
*/ int current_max_closure; @@ -122,7 +125,8 @@ public: device_memory& use_queues_flag, device_memory& work_pool_wgs) = 0; - virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) = 0; + virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name, + const DeviceRequestedFeatures&) = 0; virtual int2 split_kernel_local_size() = 0; virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task) = 0; }; diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp index ca303365627..3bc4c310283 100644 --- a/intern/cycles/device/device_task.cpp +++ b/intern/cycles/device/device_task.cpp @@ -56,7 +56,7 @@ int DeviceTask::get_subtask_count(int num, int max_size) if(type == SHADER) { num = min(shader_w, num); } - else if(type == PATH_TRACE) { + else if(type == RENDER) { } else { num = min(h, num); @@ -82,7 +82,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size) tasks.push_back(task); } } - else if(type == PATH_TRACE) { + else if(type == RENDER) { for(int i = 0; i < num; i++) tasks.push_back(*this); } @@ -103,7 +103,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size) void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples) { - if((type != PATH_TRACE) && + if((type != RENDER) && (type != SHADER)) return; diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index feee89fd6e4..44a1efff1f5 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -34,7 +34,7 @@ class Tile; class DeviceTask : public Task { public: - typedef enum { PATH_TRACE, FILM_CONVERT, SHADER } Type; + typedef enum { RENDER, FILM_CONVERT, SHADER } Type; Type type; int x, y, w, h; @@ -53,7 +53,7 @@ public: int passes_size; - explicit DeviceTask(Type type = PATH_TRACE); + explicit DeviceTask(Type type = RENDER); int get_subtask_count(int num, int max_size 
= 0); void split(list<DeviceTask>& tasks, int num, int max_size = 0); @@ -65,6 +65,16 @@ public: function<void(RenderTile&)> update_tile_sample; function<void(RenderTile&)> release_tile; function<bool(void)> get_cancel; + function<void(RenderTile*, Device*)> map_neighbor_tiles; + function<void(RenderTile*, Device*)> unmap_neighbor_tiles; + + int denoising_radius; + float denoising_strength; + float denoising_feature_strength; + bool denoising_relative_pca; + int pass_stride; + int pass_denoising_data; + int pass_denoising_clean; bool need_finish_queue; bool integrator_branched; diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h index 764216d0dfa..78ca377d933 100644 --- a/intern/cycles/device/opencl/opencl.h +++ b/intern/cycles/device/opencl/opencl.h @@ -17,6 +17,7 @@ #ifdef WITH_OPENCL #include "device/device.h" +#include "device/device_denoising.h" #include "util/util_map.h" #include "util/util_param.h" @@ -26,24 +27,24 @@ CCL_NAMESPACE_BEGIN +/* Disable workarounds, seems to be working fine on latest drivers. */ +#define CYCLES_DISABLE_DRIVER_WORKAROUNDS + /* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workaounds for testing */ #ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS /* Work around AMD driver hangs by ensuring each command is finished before doing anything else. 
*/ # undef clEnqueueNDRangeKernel # define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \ - clFinish(a); \ CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \ clFinish(a); # undef clEnqueueWriteBuffer # define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \ - clFinish(a); \ CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \ clFinish(a); # undef clEnqueueReadBuffer # define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \ - clFinish(a); \ CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \ clFinish(a); #endif /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */ @@ -86,7 +87,7 @@ public: string *error = NULL); static bool device_version_check(cl_device_id device, string *error = NULL); - static string get_hardware_id(string platform_name, + static string get_hardware_id(const string& platform_name, cl_device_id device_id); static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices, bool force_all = false); @@ -132,6 +133,13 @@ public: cl_int* error = NULL); static cl_device_type get_device_type(cl_device_id device_id); + static bool get_driver_version(cl_device_id device_id, + int *major, + int *minor, + cl_int* error = NULL); + + static int mem_address_alignment(cl_device_id device_id); + /* Get somewhat more readable device name. * Main difference is AMD OpenCL here which only gives code name * for the regular device name. 
This will give more sane device @@ -221,7 +229,7 @@ public: cl_int err = stmt; \ \ if(err != CL_SUCCESS) { \ - string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \ + string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ if(error_msg == "") \ error_msg = message; \ fprintf(stderr, "%s\n", message.c_str()); \ @@ -242,17 +250,17 @@ public: public: OpenCLProgram() : loaded(false), device(NULL) {} OpenCLProgram(OpenCLDeviceBase *device, - string program_name, - string kernel_name, - string kernel_build_options, + const string& program_name, + const string& kernel_name, + const string& kernel_build_options, bool use_stdout = true); ~OpenCLProgram(); void add_kernel(ustring name); void load(); - bool is_loaded() { return loaded; } - string get_log() { return log; } + bool is_loaded() const { return loaded; } + const string& get_log() const { return log; } void report_error(); cl_kernel operator()(); @@ -266,8 +274,8 @@ public: bool load_binary(const string& clbin, const string *debug_src = NULL); bool save_binary(const string& clbin); - void add_log(string msg, bool is_debug); - void add_error(string msg); + void add_log(const string& msg, bool is_debug); + void add_error(const string& msg); bool loaded; cl_program program; @@ -285,7 +293,7 @@ public: map<ustring, cl_kernel> kernels; }; - OpenCLProgram base_program; + OpenCLProgram base_program, denoising_program; typedef map<string, device_vector<uchar>*> ConstMemMap; typedef map<string, device_ptr> MemMap; @@ -323,6 +331,9 @@ public: void mem_copy_from(device_memory& mem, int y, int w, int h, int elem); void mem_zero(device_memory& mem); void mem_free(device_memory& mem); + + int mem_address_alignment(); + void const_copy_to(const char *name, void *host, size_t size); void tex_alloc(const char *name, device_memory& mem, @@ -331,12 +342,14 @@ public: void tex_free(device_memory& mem); size_t global_size_round_up(int 
group_size, int global_size); - void enqueue_kernel(cl_kernel kernel, size_t w, size_t h); + void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size = -1); void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name); void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half); void shader(DeviceTask& task); + void denoise(RenderTile& tile, const DeviceTask& task); + class OpenCLDeviceTask : public DeviceTask { public: OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task) @@ -370,9 +383,51 @@ public: virtual void thread_run(DeviceTask * /*task*/) = 0; + virtual bool is_split_kernel() = 0; + protected: string kernel_build_options(const string *debug_src = NULL); + void mem_zero_kernel(device_ptr ptr, size_t size); + + bool denoising_non_local_means(device_ptr image_ptr, + device_ptr guide_ptr, + device_ptr variance_ptr, + device_ptr out_ptr, + DenoisingTask *task); + bool denoising_construct_transform(DenoisingTask *task); + bool denoising_reconstruct(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr output_ptr, + DenoisingTask *task); + bool denoising_combine_halves(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, int4 rect, + DenoisingTask *task); + bool denoising_divide_shadow(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, + DenoisingTask *task); + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + DenoisingTask *task); + bool denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task); + bool denoising_set_tiles(device_ptr *buffers, + DenoisingTask *task); + + device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type); + void 
mem_free_sub_ptr(device_ptr ptr); + class ArgumentWrapper { public: ArgumentWrapper() : size(0), pointer(NULL) diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index 52d0662a8e3..509da7a0a84 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -20,6 +20,7 @@ #include "kernel/kernel_types.h" +#include "util/util_algorithm.h" #include "util/util_foreach.h" #include "util/util_logging.h" #include "util/util_md5.h" @@ -213,8 +214,24 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea base_program.add_kernel(ustring("bake")); base_program.add_kernel(ustring("zero_buffer")); + denoising_program = OpenCLProgram(this, "denoising", "filter.cl", ""); + denoising_program.add_kernel(ustring("filter_divide_shadow")); + denoising_program.add_kernel(ustring("filter_get_feature")); + denoising_program.add_kernel(ustring("filter_detect_outliers")); + denoising_program.add_kernel(ustring("filter_combine_halves")); + denoising_program.add_kernel(ustring("filter_construct_transform")); + denoising_program.add_kernel(ustring("filter_nlm_calc_difference")); + denoising_program.add_kernel(ustring("filter_nlm_blur")); + denoising_program.add_kernel(ustring("filter_nlm_calc_weight")); + denoising_program.add_kernel(ustring("filter_nlm_update_output")); + denoising_program.add_kernel(ustring("filter_nlm_normalize")); + denoising_program.add_kernel(ustring("filter_nlm_construct_gramian")); + denoising_program.add_kernel(ustring("filter_finalize")); + denoising_program.add_kernel(ustring("filter_set_tiles")); + vector<OpenCLProgram*> programs; programs.push_back(&base_program); + programs.push_back(&denoising_program); /* Call actual class to fill the vector with its programs. 
*/ if(!load_kernels(requested_features, programs)) { return false; @@ -260,6 +277,25 @@ void OpenCLDeviceBase::mem_alloc(const char *name, device_memory& mem, MemoryTyp size_t size = mem.memory_size(); + /* check there is enough memory available for the allocation */ + cl_ulong max_alloc_size = 0; + clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL); + + if(DebugFlags().opencl.mem_limit) { + max_alloc_size = min(max_alloc_size, + cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used)); + } + + if(size > max_alloc_size) { + string error = "Scene too complex to fit in available memory."; + if(name != NULL) { + error += string_printf(" (allocating buffer %s failed.)", name); + } + set_error(error); + + return; + } + cl_mem_flags mem_flag; void *mem_ptr = NULL; @@ -322,37 +358,42 @@ void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, in NULL, NULL)); } -void OpenCLDeviceBase::mem_zero(device_memory& mem) +void OpenCLDeviceBase::mem_zero_kernel(device_ptr mem, size_t size) { - if(mem.device_pointer) { - if(base_program.is_loaded()) { - cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); + cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); - size_t global_size[] = {1024, 1024}; - size_t num_threads = global_size[0] * global_size[1]; + size_t global_size[] = {1024, 1024}; + size_t num_threads = global_size[0] * global_size[1]; - cl_mem d_buffer = CL_MEM_PTR(mem.device_pointer); - cl_ulong d_offset = 0; - cl_ulong d_size = 0; + cl_mem d_buffer = CL_MEM_PTR(mem); + cl_ulong d_offset = 0; + cl_ulong d_size = 0; - while(d_offset < mem.memory_size()) { - d_size = std::min<cl_ulong>(num_threads*sizeof(float4), mem.memory_size() - d_offset); + while(d_offset < size) { + d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset); - kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); + kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); - ciErr = 
clEnqueueNDRangeKernel(cqCommandQueue, - ckZeroBuffer, - 2, - NULL, - global_size, - NULL, - 0, - NULL, - NULL); - opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); + ciErr = clEnqueueNDRangeKernel(cqCommandQueue, + ckZeroBuffer, + 2, + NULL, + global_size, + NULL, + 0, + NULL, + NULL); + opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); - d_offset += d_size; - } + d_offset += d_size; + } +} + +void OpenCLDeviceBase::mem_zero(device_memory& mem) +{ + if(mem.device_pointer) { + if(base_program.is_loaded()) { + mem_zero_kernel(mem.device_pointer, mem.memory_size()); } if(mem.data_pointer) { @@ -396,6 +437,41 @@ void OpenCLDeviceBase::mem_free(device_memory& mem) } } +int OpenCLDeviceBase::mem_address_alignment() +{ + return OpenCLInfo::mem_address_alignment(cdDevice); +} + +device_ptr OpenCLDeviceBase::mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type) +{ + cl_mem_flags mem_flag; + if(type == MEM_READ_ONLY) + mem_flag = CL_MEM_READ_ONLY; + else if(type == MEM_WRITE_ONLY) + mem_flag = CL_MEM_WRITE_ONLY; + else + mem_flag = CL_MEM_READ_WRITE; + + cl_buffer_region info; + info.origin = mem.memory_elements_size(offset); + info.size = mem.memory_elements_size(size); + + device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer), + mem_flag, + CL_BUFFER_CREATE_TYPE_REGION, + &info, + &ciErr); + opencl_assert_err(ciErr, "clCreateSubBuffer"); + return sub_buf; +} + +void OpenCLDeviceBase::mem_free_sub_ptr(device_ptr device_pointer) +{ + if(device_pointer && device_pointer != null_mem) { + opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer))); + } +} + void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size) { ConstMemMap::iterator i = const_mem_map.find(name); @@ -449,7 +525,7 @@ size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size) return global_size + ((r == 0)? 
0: group_size - r); } -void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h) +void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size) { size_t workgroup_size, max_work_items[3]; @@ -458,6 +534,10 @@ void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h) clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL); + if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) { + workgroup_size = max_workgroup_size; + } + /* Try to divide evenly over 2 dimensions. */ size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size}; @@ -543,6 +623,380 @@ set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name); enqueue_kernel(ckFilmConvertKernel, d_w, d_h); } +bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, + device_ptr guide_ptr, + device_ptr variance_ptr, + device_ptr out_ptr, + DenoisingTask *task) +{ + int4 rect = task->rect; + int w = rect.z-rect.x; + int h = rect.w-rect.y; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + cl_mem difference = CL_MEM_PTR(task->nlm_state.temporary_1_ptr); + cl_mem blurDifference = CL_MEM_PTR(task->nlm_state.temporary_2_ptr); + cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr); + + cl_mem image_mem = CL_MEM_PTR(image_ptr); + cl_mem guide_mem = CL_MEM_PTR(guide_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + cl_mem out_mem = CL_MEM_PTR(out_ptr); + + mem_zero_kernel(task->nlm_state.temporary_3_ptr, sizeof(float)*w*h); + mem_zero_kernel(out_ptr, sizeof(float)*w*h); + + cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); + cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); + cl_kernel ckNLMCalcWeight = 
denoising_program(ustring("filter_nlm_calc_weight")); + cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output")); + cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize")); + + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + int4 local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)); + kernel_set_args(ckNLMCalcDifference, 0, + dx, dy, guide_mem, variance_mem, + difference, local_rect, w, 0, a, k_2); + kernel_set_args(ckNLMBlur, 0, + difference, blurDifference, local_rect, w, f); + kernel_set_args(ckNLMCalcWeight, 0, + blurDifference, difference, local_rect, w, f); + kernel_set_args(ckNLMUpdateOutput, 0, + dx, dy, blurDifference, image_mem, + out_mem, weightAccum, local_rect, w, f); + + enqueue_kernel(ckNLMCalcDifference, w, h); + enqueue_kernel(ckNLMBlur, w, h); + enqueue_kernel(ckNLMCalcWeight, w, h); + enqueue_kernel(ckNLMBlur, w, h); + enqueue_kernel(ckNLMUpdateOutput, w, h); + } + + int4 local_rect = make_int4(0, 0, w, h); + kernel_set_args(ckNLMNormalize, 0, + out_mem, weightAccum, local_rect, w); + enqueue_kernel(ckNLMNormalize, w, h); + + return true; +} + +bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task) +{ + cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); + cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + + cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform")); + + kernel_set_args(ckFilterConstructTransform, 0, + buffer_mem, + transform_mem, + rank_mem, + task->filter_area, + task->rect, + task->buffer.pass_stride, + task->radius, + task->pca_threshold); + + enqueue_kernel(ckFilterConstructTransform, + task->storage.w, + task->storage.h, + 256); + + return true; +} + +bool OpenCLDeviceBase::denoising_reconstruct(device_ptr 
color_ptr, + device_ptr color_variance_ptr, + device_ptr output_ptr, + DenoisingTask *task) +{ + mem_zero(task->storage.XtWX); + mem_zero(task->storage.XtWY); + + cl_mem color_mem = CL_MEM_PTR(color_ptr); + cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr); + cl_mem output_mem = CL_MEM_PTR(output_ptr); + + cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); + cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); + cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); + + cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); + cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); + cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); + cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian")); + cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); + + cl_mem difference = CL_MEM_PTR(task->reconstruction_state.temporary_1_ptr); + cl_mem blurDifference = CL_MEM_PTR(task->reconstruction_state.temporary_2_ptr); + + int r = task->radius; + int f = 4; + float a = 1.0f; + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + + int local_rect[4] = {max(0, -dx), max(0, -dy), + task->reconstruction_state.source_w - max(0, dx), + task->reconstruction_state.source_h - max(0, dy)}; + + kernel_set_args(ckNLMCalcDifference, 0, + dx, dy, + color_mem, + color_variance_mem, + difference, + local_rect, + task->buffer.w, + task->buffer.pass_stride, + a, task->nlm_k_2); + enqueue_kernel(ckNLMCalcDifference, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + kernel_set_args(ckNLMBlur, 0, + difference, + blurDifference, + local_rect, + task->buffer.w, + f); + enqueue_kernel(ckNLMBlur, + 
task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + kernel_set_args(ckNLMCalcWeight, 0, + blurDifference, + difference, + local_rect, + task->buffer.w, + f); + enqueue_kernel(ckNLMCalcWeight, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + /* Reuse previous arguments. */ + enqueue_kernel(ckNLMBlur, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + kernel_set_args(ckNLMConstructGramian, 0, + dx, dy, + blurDifference, + buffer_mem, + transform_mem, + rank_mem, + XtWX_mem, + XtWY_mem, + local_rect, + task->reconstruction_state.filter_rect, + task->buffer.w, + task->buffer.h, + f, + task->buffer.pass_stride); + enqueue_kernel(ckNLMConstructGramian, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h, + 256); + } + + kernel_set_args(ckFinalize, 0, + task->buffer.w, + task->buffer.h, + output_mem, + rank_mem, + XtWX_mem, + XtWY_mem, + task->filter_area, + task->reconstruction_state.buffer_params, + task->render_buffer.samples); + enqueue_kernel(ckFinalize, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + return true; +} + +bool OpenCLDeviceBase::denoising_combine_halves(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, int4 rect, + DenoisingTask *task) +{ + cl_mem a_mem = CL_MEM_PTR(a_ptr); + cl_mem b_mem = CL_MEM_PTR(b_ptr); + cl_mem mean_mem = CL_MEM_PTR(mean_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + + cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves")); + + kernel_set_args(ckFilterCombineHalves, 0, + mean_mem, + variance_mem, + a_mem, + b_mem, + rect, + r); + enqueue_kernel(ckFilterCombineHalves, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr 
sv_variance_ptr, + device_ptr buffer_variance_ptr, + DenoisingTask *task) +{ + cl_mem a_mem = CL_MEM_PTR(a_ptr); + cl_mem b_mem = CL_MEM_PTR(b_ptr); + cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr); + cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr); + cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr); + + cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer); + + cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow")); + + char split_kernel = is_split_kernel()? 1 : 0; + kernel_set_args(ckFilterDivideShadow, 0, + task->render_buffer.samples, + tiles_mem, + a_mem, + b_mem, + sample_variance_mem, + sv_variance_mem, + buffer_variance_mem, + task->rect, + task->render_buffer.pass_stride, + task->render_buffer.denoising_data_offset, + split_kernel); + enqueue_kernel(ckFilterDivideShadow, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDeviceBase::denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + DenoisingTask *task) +{ + cl_mem mean_mem = CL_MEM_PTR(mean_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + + cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer); + + cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature")); + + char split_kernel = is_split_kernel()? 
1 : 0; + kernel_set_args(ckFilterGetFeature, 0, + task->render_buffer.samples, + tiles_mem, + mean_offset, + variance_offset, + mean_mem, + variance_mem, + task->rect, + task->render_buffer.pass_stride, + task->render_buffer.denoising_data_offset, + split_kernel); + enqueue_kernel(ckFilterGetFeature, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task) +{ + cl_mem image_mem = CL_MEM_PTR(image_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + cl_mem depth_mem = CL_MEM_PTR(depth_ptr); + cl_mem output_mem = CL_MEM_PTR(output_ptr); + + cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers")); + + kernel_set_args(ckFilterDetectOutliers, 0, + image_mem, + variance_mem, + depth_mem, + output_mem, + task->rect, + task->buffer.pass_stride); + enqueue_kernel(ckFilterDetectOutliers, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDeviceBase::denoising_set_tiles(device_ptr *buffers, + DenoisingTask *task) +{ + mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_WRITE); + mem_copy_to(task->tiles_mem); + + cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer); + + cl_kernel ckFilterSetTiles = denoising_program(ustring("filter_set_tiles")); + + kernel_set_args(ckFilterSetTiles, 0, tiles_mem); + for(int i = 0; i < 9; i++) { + cl_mem buffer_mem = CL_MEM_PTR(buffers[i]); + kernel_set_args(ckFilterSetTiles, i+1, buffer_mem); + } + + enqueue_kernel(ckFilterSetTiles, 1, 1); + + return true; +} + +void OpenCLDeviceBase::denoise(RenderTile &rtile, const DeviceTask &task) +{ + DenoisingTask denoising(this); + + denoising.functions.set_tiles = function_bind(&OpenCLDeviceBase::denoising_set_tiles, this, _1, &denoising); + denoising.functions.construct_transform = 
function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising); + denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, &denoising); + denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + denoising.functions.detect_outliers = function_bind(&OpenCLDeviceBase::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); + + denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); + denoising.render_buffer.samples = rtile.sample; + + RenderTile rtiles[9]; + rtiles[4] = rtile; + task.map_neighbor_tiles(rtiles, this); + denoising.tiles_from_rendertiles(rtiles); + + denoising.init_from_devicetask(task); + + denoising.run_denoising(); + + task.unmap_neighbor_tiles(rtiles, this); +} + void OpenCLDeviceBase::shader(DeviceTask& task) { /* cast arguments to cl types */ @@ -612,7 +1066,7 @@ void OpenCLDeviceBase::shader(DeviceTask& task) string OpenCLDeviceBase::kernel_build_options(const string *debug_src) { - string build_options = "-cl-fast-relaxed-math "; + string build_options = "-cl-no-signed-zeros -cl-mad-enable "; if(platform_name == "NVIDIA CUDA") { build_options += "-D__KERNEL_OPENCL_NVIDIA__ " @@ -792,7 +1246,7 @@ void OpenCLDeviceBase::store_cached_kernel( } string OpenCLDeviceBase::build_options_for_base_program( - const DeviceRequestedFeatures& /*requested_features*/) + const DeviceRequestedFeatures& requested_features) { /* TODO(sergey): By default we compile all features, meaning * mega kernel is 
not getting feature-based optimizations. @@ -800,6 +1254,14 @@ string OpenCLDeviceBase::build_options_for_base_program( * Ideally we need always compile kernel with as less features * enabled as possible to keep performance at it's max. */ + + /* For now disable baking when not in use as this has major + * impact on kernel build times. + */ + if(!requested_features.use_baking) { + return "-D__NO_BAKING__"; + } + return ""; } diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp index a2fd1d71156..06c15bcf401 100644 --- a/intern/cycles/device/opencl/opencl_mega.cpp +++ b/intern/cycles/device/opencl/opencl_mega.cpp @@ -108,41 +108,53 @@ public: else if(task->type == DeviceTask::SHADER) { shader(*task); } - else if(task->type == DeviceTask::PATH_TRACE) { + else if(task->type == DeviceTask::RENDER) { RenderTile tile; /* Keep rendering tiles until done. */ while(task->acquire_tile(this, tile)) { - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; + if(tile.task == RenderTile::PATH_TRACE) { + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; - for(int sample = start_sample; sample < end_sample; sample++) { - if(task->get_cancel()) { - if(task->need_finish_queue == false) - break; - } + for(int sample = start_sample; sample < end_sample; sample++) { + if(task->get_cancel()) { + if(task->need_finish_queue == false) + break; + } + + path_trace(tile, sample); - path_trace(tile, sample); + tile.sample = sample + 1; - tile.sample = sample + 1; + task->update_progress(&tile, tile.w*tile.h); + } + /* Complete kernel execution before release tile */ + /* This helps in multi-device render; + * The device that reaches the critical-section function + * release_tile waits (stalling other devices from entering + * release_tile) for all kernels to complete. 
If device1 (a + * slow-render device) reaches release_tile first then it would + * stall device2 (a fast-render device) from proceeding to render + * next tile. + */ + clFinish(cqCommandQueue); + } + else if(tile.task == RenderTile::DENOISE) { + tile.sample = tile.start_sample + tile.num_samples; + denoise(tile, *task); task->update_progress(&tile, tile.w*tile.h); } - /* Complete kernel execution before release tile */ - /* This helps in multi-device render; - * The device that reaches the critical-section function - * release_tile waits (stalling other devices from entering - * release_tile) for all kernels to complete. If device1 (a - * slow-render device) reaches release_tile first then it would - * stall device2 (a fast-render device) from proceeding to render - * next tile. - */ - clFinish(cqCommandQueue); - task->release_tile(tile); } } } + + bool is_split_kernel() + { + return false; + } }; Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background) diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp index b8df57ec7b9..76d9983e9a2 100644 --- a/intern/cycles/device/opencl/opencl_split.cpp +++ b/intern/cycles/device/opencl/opencl_split.cpp @@ -25,6 +25,7 @@ #include "device/device_split_kernel.h" +#include "util/util_algorithm.h" #include "util/util_logging.h" #include "util/util_md5.h" #include "util/util_path.h" @@ -70,6 +71,10 @@ public: delete split_kernel; } + virtual bool show_samples() const { + return true; + } + virtual bool load_kernels(const DeviceRequestedFeatures& requested_features, vector<OpenCLDeviceBase::OpenCLProgram*> &programs) { @@ -100,7 +105,7 @@ public: else if(task->type == DeviceTask::SHADER) { shader(*task); } - else if(task->type == DeviceTask::PATH_TRACE) { + else if(task->type == DeviceTask::RENDER) { RenderTile tile; /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to @@ -123,21 +128,29 @@ public: /* Keep rendering tiles until done. 
*/ while(task->acquire_tile(this, tile)) { - split_kernel->path_trace(task, - tile, - kgbuffer, - *const_mem_map["__data"]); - - /* Complete kernel execution before release tile. */ - /* This helps in multi-device render; - * The device that reaches the critical-section function - * release_tile waits (stalling other devices from entering - * release_tile) for all kernels to complete. If device1 (a - * slow-render device) reaches release_tile first then it would - * stall device2 (a fast-render device) from proceeding to render - * next tile. - */ - clFinish(cqCommandQueue); + if(tile.task == RenderTile::PATH_TRACE) { + assert(tile.task == RenderTile::PATH_TRACE); + split_kernel->path_trace(task, + tile, + kgbuffer, + *const_mem_map["__data"]); + + /* Complete kernel execution before release tile. */ + /* This helps in multi-device render; + * The device that reaches the critical-section function + * release_tile waits (stalling other devices from entering + * release_tile) for all kernels to complete. If device1 (a + * slow-render device) reaches release_tile first then it would + * stall device2 (a fast-render device) from proceeding to render + * next tile. 
+ */ + clFinish(cqCommandQueue); + } + else if(tile.task == RenderTile::DENOISE) { + tile.sample = tile.start_sample + tile.num_samples; + denoise(tile, *task); + task->update_progress(&tile, tile.w*tile.h); + } task->release_tile(tile); } @@ -146,6 +159,11 @@ public: } } + bool is_split_kernel() + { + return true; + } + protected: /* ** Those guys are for workign around some compiler-specific bugs ** */ @@ -159,17 +177,62 @@ protected: friend class OpenCLSplitKernelFunction; }; +struct CachedSplitMemory { + int id; + device_memory *split_data; + device_memory *ray_state; + device_ptr *rng_state; + device_memory *queue_index; + device_memory *use_queues_flag; + device_memory *work_pools; + device_ptr *buffer; +}; + class OpenCLSplitKernelFunction : public SplitKernelFunction { public: OpenCLDeviceSplitKernel* device; OpenCLDeviceBase::OpenCLProgram program; + CachedSplitMemory& cached_memory; + int cached_id; + + OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device, CachedSplitMemory& cached_memory) : + device(device), cached_memory(cached_memory), cached_id(cached_memory.id-1) + { + } - OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device) : device(device) {} - ~OpenCLSplitKernelFunction() { program.release(); } + ~OpenCLSplitKernelFunction() + { + program.release(); + } virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data) { - device->kernel_set_args(program(), 0, kg, data); + if(cached_id != cached_memory.id) { + cl_uint start_arg_index = + device->kernel_set_args(program(), + 0, + kg, + data, + *cached_memory.split_data, + *cached_memory.ray_state, + *cached_memory.rng_state); + +/* TODO(sergey): Avoid map lookup here. 
*/ +#define KERNEL_TEX(type, ttype, name) \ + device->set_kernel_arg_mem(program(), &start_arg_index, #name); +#include "kernel/kernel_textures.h" +#undef KERNEL_TEX + + start_arg_index += + device->kernel_set_args(program(), + start_arg_index, + *cached_memory.queue_index, + *cached_memory.use_queues_flag, + *cached_memory.work_pools, + *cached_memory.buffer); + + cached_id = cached_memory.id; + } device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, program(), @@ -196,14 +259,15 @@ public: class OpenCLSplitKernel : public DeviceSplitKernel { OpenCLDeviceSplitKernel *device; + CachedSplitMemory cached_memory; public: explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) { } - virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, + virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name, const DeviceRequestedFeatures& requested_features) { - OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device); + OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device, cached_memory); bool single_program = OpenCLInfo::use_single_program(); kernel->program = @@ -332,6 +396,15 @@ public: return false; } + cached_memory.split_data = &split_data; + cached_memory.ray_state = &ray_state; + cached_memory.rng_state = &rtile.rng_state; + cached_memory.queue_index = &queue_index; + cached_memory.use_queues_flag = &use_queues_flag; + cached_memory.work_pools = &work_pool_wgs; + cached_memory.buffer = &rtile.buffer; + cached_memory.id++; + return true; } @@ -351,12 +424,18 @@ public: cl_ulong max_buffer_size; clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); + + if(DebugFlags().opencl.mem_limit) { + max_buffer_size = min(max_buffer_size, + cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used)); + } + VLOG(1) << "Maximum device allocation size: " << 
string_human_readable_number(max_buffer_size) << " bytes. (" << string_human_readable_size(max_buffer_size) << ")."; size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size / 2); - int2 global_size = make_int2(round_down((int)sqrt(num_elements), 64), (int)sqrt(num_elements)); + int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), (int)sqrt(num_elements)); VLOG(1) << "Global size: " << global_size << "."; return global_size; } diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp index fe1c65a2224..0d34af3e040 100644 --- a/intern/cycles/device/opencl/opencl_util.cpp +++ b/intern/cycles/device/opencl/opencl_util.cpp @@ -241,9 +241,9 @@ string OpenCLCache::get_kernel_md5() } OpenCLDeviceBase::OpenCLProgram::OpenCLProgram(OpenCLDeviceBase *device, - string program_name, - string kernel_file, - string kernel_build_options, + const string& program_name, + const string& kernel_file, + const string& kernel_build_options, bool use_stdout) : device(device), program_name(program_name), @@ -274,7 +274,7 @@ void OpenCLDeviceBase::OpenCLProgram::release() } } -void OpenCLDeviceBase::OpenCLProgram::add_log(string msg, bool debug) +void OpenCLDeviceBase::OpenCLProgram::add_log(const string& msg, bool debug) { if(!use_stdout) { log += msg + "\n"; @@ -288,7 +288,7 @@ void OpenCLDeviceBase::OpenCLProgram::add_log(string msg, bool debug) } } -void OpenCLDeviceBase::OpenCLProgram::add_error(string msg) +void OpenCLDeviceBase::OpenCLProgram::add_error(const string& msg) { if(use_stdout) { fprintf(stderr, "%s\n", msg.c_str()); @@ -608,6 +608,14 @@ bool OpenCLInfo::device_supported(const string& platform_name, if(!get_device_name(device_id, &device_name)) { return false; } + + int driver_major = 0; + int driver_minor = 0; + if(!get_driver_version(device_id, &driver_major, &driver_minor)) { + return false; + } + VLOG(3) << "OpenCL driver version " << driver_major << "." 
<< driver_minor; + /* It is possible tyo have Iris GPU on AMD/Apple OpenCL framework * (aka, it will not be on Intel framework). This isn't supported * and needs an explicit blacklist. @@ -618,6 +626,21 @@ bool OpenCLInfo::device_supported(const string& platform_name, if(platform_name == "AMD Accelerated Parallel Processing" && device_type == CL_DEVICE_TYPE_GPU) { + if(driver_major < 2236) { + VLOG(1) << "AMD driver version " << driver_major << "." << driver_minor << " not supported."; + return false; + } + const char *blacklist[] = { + /* GCN 1 */ + "Tahiti", "Pitcairn", "Capeverde", "Oland", + NULL + }; + for (int i = 0; blacklist[i] != NULL; i++) { + if(device_name == blacklist[i]) { + VLOG(1) << "AMD device " << device_name << " not supported"; + return false; + } + } return true; } if(platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) { @@ -684,7 +707,7 @@ bool OpenCLInfo::device_version_check(cl_device_id device, return true; } -string OpenCLInfo::get_hardware_id(string platform_name, cl_device_id device_id) +string OpenCLInfo::get_hardware_id(const string& platform_name, cl_device_id device_id) { if(platform_name == "AMD Accelerated Parallel Processing" || platform_name == "Apple") { /* Use cl_amd_device_topology extension. 
*/ @@ -902,7 +925,7 @@ bool OpenCLInfo::get_platform_name(cl_platform_id platform_id, string OpenCLInfo::get_platform_name(cl_platform_id platform_id) { string platform_name; - if (!get_platform_name(platform_id, &platform_name)) { + if(!get_platform_name(platform_id, &platform_name)) { return ""; } return platform_name; @@ -1063,7 +1086,7 @@ string OpenCLInfo::get_readable_device_name(cl_device_id device_id) CL_DEVICE_BOARD_NAME_AMD, sizeof(board_name), &board_name, - &length) == CL_SUCCESS) + &length) == CL_SUCCESS) { if(length != 0 && board_name[0] != '\0') { return board_name; @@ -1073,6 +1096,48 @@ string OpenCLInfo::get_readable_device_name(cl_device_id device_id) return get_device_name(device_id); } +bool OpenCLInfo::get_driver_version(cl_device_id device_id, + int *major, + int *minor, + cl_int* error) +{ + char buffer[1024]; + cl_int err; + if((err = clGetDeviceInfo(device_id, + CL_DRIVER_VERSION, + sizeof(buffer), + &buffer, + NULL)) != CL_SUCCESS) + { + if(error != NULL) { + *error = err; + } + return false; + } + if(error != NULL) { + *error = CL_SUCCESS; + } + if(sscanf(buffer, "%d.%d", major, minor) < 2) { + VLOG(1) << string_printf("OpenCL: failed to parse driver version string (%s).", buffer); + return false; + } + return true; +} + +int OpenCLInfo::mem_address_alignment(cl_device_id device_id) +{ + int base_align_bits; + if(clGetDeviceInfo(device_id, + CL_DEVICE_MEM_BASE_ADDR_ALIGN, + sizeof(int), + &base_align_bits, + NULL) == CL_SUCCESS) + { + return base_align_bits/8; + } + return 1; +} + CCL_NAMESPACE_END #endif diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index dbc2ba2503a..23e9bd311c4 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -10,7 +10,23 @@ set(INC_SYS set(SRC kernels/cpu/kernel.cpp + kernels/cpu/kernel_sse2.cpp + kernels/cpu/kernel_sse3.cpp + kernels/cpu/kernel_sse41.cpp + kernels/cpu/kernel_avx.cpp + kernels/cpu/kernel_avx2.cpp 
kernels/cpu/kernel_split.cpp + kernels/cpu/kernel_split_sse2.cpp + kernels/cpu/kernel_split_sse3.cpp + kernels/cpu/kernel_split_sse41.cpp + kernels/cpu/kernel_split_avx.cpp + kernels/cpu/kernel_split_avx2.cpp + kernels/cpu/filter.cpp + kernels/cpu/filter_sse2.cpp + kernels/cpu/filter_sse3.cpp + kernels/cpu/filter_sse41.cpp + kernels/cpu/filter_avx.cpp + kernels/cpu/filter_avx2.cpp kernels/opencl/kernel.cl kernels/opencl/kernel_state_buffer_size.cl kernels/opencl/kernel_split.cl @@ -21,17 +37,22 @@ set(SRC kernels/opencl/kernel_lamp_emission.cl kernels/opencl/kernel_do_volume.cl kernels/opencl/kernel_indirect_background.cl + kernels/opencl/kernel_shader_setup.cl + kernels/opencl/kernel_shader_sort.cl kernels/opencl/kernel_shader_eval.cl kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl kernels/opencl/kernel_subsurface_scatter.cl kernels/opencl/kernel_direct_lighting.cl kernels/opencl/kernel_shadow_blocked_ao.cl kernels/opencl/kernel_shadow_blocked_dl.cl + kernels/opencl/kernel_enqueue_inactive.cl kernels/opencl/kernel_next_iteration_setup.cl kernels/opencl/kernel_indirect_subsurface.cl kernels/opencl/kernel_buffer_update.cl + kernels/opencl/filter.cl kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu + kernels/cuda/filter.cu ) set(SRC_BVH_HEADERS @@ -93,12 +114,18 @@ set(SRC_KERNELS_CPU_HEADERS kernels/cpu/kernel_cpu.h kernels/cpu/kernel_cpu_impl.h kernels/cpu/kernel_cpu_image.h + kernels/cpu/filter_cpu.h + kernels/cpu/filter_cpu_impl.h ) set(SRC_KERNELS_CUDA_HEADERS kernels/cuda/kernel_config.h ) +set(SRC_KERNELS_OPENCL_HEADERS + kernels/opencl/kernel_split_function.h +) + set(SRC_CLOSURE_HEADERS closure/alloc.h closure/bsdf.h @@ -120,6 +147,8 @@ set(SRC_CLOSURE_HEADERS closure/bssrdf.h closure/emissive.h closure/volume.h + closure/bsdf_principled_diffuse.h + closure/bsdf_principled_sheen.h ) set(SRC_SVM_HEADERS @@ -186,6 +215,21 @@ set(SRC_GEOM_HEADERS geom/geom_volume.h ) +set(SRC_FILTER_HEADERS + filter/filter.h + filter/filter_defines.h + 
filter/filter_features.h + filter/filter_features_sse.h + filter/filter_kernel.h + filter/filter_nlm_cpu.h + filter/filter_nlm_gpu.h + filter/filter_prefilter.h + filter/filter_reconstruction.h + filter/filter_transform.h + filter/filter_transform_gpu.h + filter/filter_transform_sse.h +) + set(SRC_UTIL_HEADERS ../util/util_atomic.h ../util/util_color.h @@ -194,17 +238,52 @@ set(SRC_UTIL_HEADERS ../util/util_math.h ../util/util_math_fast.h ../util/util_math_intersect.h + ../util/util_math_float2.h + ../util/util_math_float3.h + ../util/util_math_float4.h + ../util/util_math_int2.h + ../util/util_math_int3.h + ../util/util_math_int4.h + ../util/util_math_matrix.h ../util/util_static_assert.h ../util/util_transform.h ../util/util_texture.h ../util/util_types.h + ../util/util_types_float2.h + ../util/util_types_float2_impl.h + ../util/util_types_float3.h + ../util/util_types_float3_impl.h + ../util/util_types_float4.h + ../util/util_types_float4_impl.h + ../util/util_types_int2.h + ../util/util_types_int2_impl.h + ../util/util_types_int3.h + ../util/util_types_int3_impl.h + ../util/util_types_int4.h + ../util/util_types_int4_impl.h + ../util/util_types_uchar2.h + ../util/util_types_uchar2_impl.h + ../util/util_types_uchar3.h + ../util/util_types_uchar3_impl.h + ../util/util_types_uchar4.h + ../util/util_types_uchar4_impl.h + ../util/util_types_uint2.h + ../util/util_types_uint2_impl.h + ../util/util_types_uint3.h + ../util/util_types_uint3_impl.h + ../util/util_types_uint4.h + ../util/util_types_uint4_impl.h + ../util/util_types_vector3.h + ../util/util_types_vector3_impl.h ) set(SRC_SPLIT_HEADERS + split/kernel_branched.h split/kernel_buffer_update.h split/kernel_data_init.h split/kernel_direct_lighting.h split/kernel_do_volume.h + split/kernel_enqueue_inactive.h split/kernel_holdout_emission_blurring_pathtermination_ao.h split/kernel_indirect_background.h split/kernel_indirect_subsurface.h @@ -213,6 +292,8 @@ set(SRC_SPLIT_HEADERS split/kernel_path_init.h 
split/kernel_queue_enqueue.h split/kernel_scene_intersect.h + split/kernel_shader_setup.h + split/kernel_shader_sort.h split/kernel_shader_eval.h split/kernel_shadow_blocked_ao.h split/kernel_shadow_blocked_dl.h @@ -256,23 +337,21 @@ if(WITH_CYCLES_CUDA_BINARIES) ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS} ) + set(cuda_filter_sources kernels/cuda/filter.cu + ${SRC_HEADERS} + ${SRC_KERNELS_CUDA_HEADERS} + ${SRC_FILTER_HEADERS} + ${SRC_UTIL_HEADERS} + ) set(cuda_cubins) - macro(CYCLES_CUDA_KERNEL_ADD arch split experimental) - if(${split}) - set(cuda_extra_flags "-D__SPLIT__") - set(cuda_cubin kernel_split) - else() - set(cuda_extra_flags "") - set(cuda_cubin kernel) - endif() - + macro(CYCLES_CUDA_KERNEL_ADD arch name flags sources experimental) if(${experimental}) - set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__) - set(cuda_cubin ${cuda_cubin}_experimental) + set(flags ${flags} -D__KERNEL_EXPERIMENTAL__) + set(name ${name}_experimental) endif() - set(cuda_cubin ${cuda_cubin}_${arch}.cubin) + set(cuda_cubin ${name}_${arch}.cubin) if(WITH_CYCLES_DEBUG) set(cuda_debug_flags "-D__KERNEL_DEBUG__") @@ -286,11 +365,7 @@ if(WITH_CYCLES_CUDA_BINARIES) set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}") set(cuda_math_flags "--use_fast_math") - if(split) - set(cuda_kernel_src "/kernels/cuda/kernel_split.cu") - else() - set(cuda_kernel_src "/kernels/cuda/kernel.cu") - endif() + set(cuda_kernel_src "/kernels/cuda/${name}.cu") add_custom_command( OUTPUT ${cuda_cubin} @@ -304,13 +379,13 @@ if(WITH_CYCLES_CUDA_BINARIES) ${cuda_arch_flags} ${cuda_version_flags} ${cuda_math_flags} - ${cuda_extra_flags} + ${flags} ${cuda_debug_flags} -I${CMAKE_CURRENT_SOURCE_DIR}/.. 
-DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC - DEPENDS ${cuda_sources}) + DEPENDS ${sources}) delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib) list(APPEND cuda_cubins ${cuda_cubin}) @@ -324,11 +399,12 @@ if(WITH_CYCLES_CUDA_BINARIES) foreach(arch ${CYCLES_CUDA_BINARIES_ARCH}) # Compile regular kernel - CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE) + CYCLES_CUDA_KERNEL_ADD(${arch} kernel "" "${cuda_sources}" FALSE) + CYCLES_CUDA_KERNEL_ADD(${arch} filter "" "${cuda_filter_sources}" FALSE) if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES) # Compile split kernel - CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE) + CYCLES_CUDA_KERNEL_ADD(${arch} kernel_split "-D__SPLIT__" ${cuda_sources} FALSE) endif() endforeach() @@ -349,41 +425,30 @@ include_directories(SYSTEM ${INC_SYS}) set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") +set_source_files_properties(kernels/cpu/filter.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") if(CXX_HAS_SSE) - list(APPEND SRC - kernels/cpu/kernel_sse2.cpp - kernels/cpu/kernel_sse3.cpp - kernels/cpu/kernel_sse41.cpp - kernels/cpu/kernel_split_sse2.cpp - kernels/cpu/kernel_split_sse3.cpp - kernels/cpu/kernel_split_sse41.cpp - ) - set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") 
set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX) - list(APPEND SRC - kernels/cpu/kernel_avx.cpp - kernels/cpu/kernel_split_avx.cpp - ) set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX2) - list(APPEND SRC - kernels/cpu/kernel_avx2.cpp - kernels/cpu/kernel_split_avx2.cpp - ) set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") endif() add_library(cycles_kernel @@ -391,8 +456,10 @@ add_library(cycles_kernel ${SRC_HEADERS} ${SRC_KERNELS_CPU_HEADERS} ${SRC_KERNELS_CUDA_HEADERS} + ${SRC_KERNELS_OPENCL_HEADERS} ${SRC_BVH_HEADERS} ${SRC_CLOSURE_HEADERS} + ${SRC_FILTER_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_SPLIT_HEADERS} @@ -422,21 +489,28 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_interse delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_do_volume.cl" 
${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_background.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_sort.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_subsurface_scatter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked_ao.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked_dl.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_enqueue_inactive.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_split_function.h" 
${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/filter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/filter.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util) diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h index 9139b99353a..86a00d2124d 100644 --- a/intern/cycles/kernel/closure/bsdf.h +++ b/intern/cycles/kernel/closure/bsdf.h @@ -27,6 +27,8 @@ #include "kernel/closure/bsdf_ashikhmin_shirley.h" #include "kernel/closure/bsdf_toon.h" #include "kernel/closure/bsdf_hair.h" +#include "kernel/closure/bsdf_principled_diffuse.h" +#include "kernel/closure/bsdf_principled_sheen.h" #ifdef __SUBSURFACE__ # include "kernel/closure/bssrdf.h" #endif @@ -86,16 +88,21 @@ ccl_device_forceinline int 
bsdf_sample(KernelGlobals *kg, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: + case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: label = bsdf_microfacet_multi_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: label = bsdf_microfacet_multi_ggx_glass_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state); break; @@ -130,6 +137,17 @@ ccl_device_forceinline int bsdf_sample(KernelGlobals *kg, label = bsdf_hair_transmission_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; +#ifdef __PRINCIPLED__ + case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID: + case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID: + label = bsdf_principled_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); + break; + case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID: + label = bsdf_principled_sheen_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); + break; +#endif /* __PRINCIPLED__ */ #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: @@ -188,14 +206,19 @@ float3 bsdf_eval(KernelGlobals *kg, eval = bsdf_transparent_eval_reflect(sc, sd->I, 
omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: + case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: eval = bsdf_microfacet_multi_ggx_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: eval = bsdf_microfacet_multi_ggx_glass_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: @@ -222,6 +245,15 @@ float3 bsdf_eval(KernelGlobals *kg, case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: eval = bsdf_hair_transmission_eval_reflect(sc, sd->I, omega_in, pdf); break; +#ifdef __PRINCIPLED__ + case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID: + case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID: + eval = bsdf_principled_diffuse_eval_reflect(sc, sd->I, omega_in, pdf); + break; + case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID: + eval = bsdf_principled_sheen_eval_reflect(sc, sd->I, omega_in, pdf); + break; +#endif /* __PRINCIPLED__ */ #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: @@ -256,14 +288,19 @@ float3 bsdf_eval(KernelGlobals *kg, eval = bsdf_transparent_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: + case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: eval = 
bsdf_microfacet_multi_ggx_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: eval = bsdf_microfacet_multi_ggx_glass_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: @@ -290,6 +327,15 @@ float3 bsdf_eval(KernelGlobals *kg, case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: eval = bsdf_hair_transmission_eval_transmit(sc, sd->I, omega_in, pdf); break; +#ifdef __PRINCIPLED__ + case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID: + case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID: + eval = bsdf_principled_diffuse_eval_transmit(sc, sd->I, omega_in, pdf); + break; + case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID: + eval = bsdf_principled_sheen_eval_transmit(sc, sd->I, omega_in, pdf); + break; +#endif /* __PRINCIPLED__ */ #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: @@ -311,11 +357,16 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness) #ifdef __SVM__ switch(sc->type) { case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: bsdf_microfacet_multi_ggx_blur(sc, roughness); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: + case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: bsdf_microfacet_ggx_blur(sc, roughness); break; @@ -349,10 +400,15 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b) case CLOSURE_BSDF_REFLECTION_ID: case CLOSURE_BSDF_REFRACTION_ID: case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: + case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: + case 
CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: @@ -367,6 +423,11 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b) case CLOSURE_BSDF_HAIR_REFLECTION_ID: case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: return bsdf_hair_merge(a, b); +#ifdef __PRINCIPLED__ + case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID: + case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID: + return bsdf_principled_diffuse_merge(a, b); +#endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: return volume_henyey_greenstein_merge(a, b); @@ -379,5 +440,23 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b) #endif } +/* Classifies a closure as diffuse-like or specular-like. + * This is needed for the denoising feature pass generation, + * which are written on the first bounce where more than 25% + * of the sampling weight belongs to diffuse-like closures. 
*/ +ccl_device_inline bool bsdf_is_specular_like(ShaderClosure *sc) +{ + if(CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { + return true; + } + + if(CLOSURE_IS_BSDF_MICROFACET(sc->type)) { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*) sc; + return (bsdf->alpha_x*bsdf->alpha_y <= 0.075f*0.075f); + } + + return false; +} + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h index 7e0f5a7ec75..a5ba2cb2972 100644 --- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h +++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h @@ -40,7 +40,6 @@ typedef ccl_addr_space struct VelvetBsdf { float sigma; float invsigma2; - float3 N; } VelvetBsdf; ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf) diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h index dcd187f9305..ec6f1f20996 100644 --- a/intern/cycles/kernel/closure/bsdf_diffuse.h +++ b/intern/cycles/kernel/closure/bsdf_diffuse.h @@ -37,7 +37,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct DiffuseBsdf { SHADER_CLOSURE_BASE; - float3 N; } DiffuseBsdf; /* DIFFUSE */ diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h index 2d982a95fe4..24f40af46a3 100644 --- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h +++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h @@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct DiffuseRampBsdf { SHADER_CLOSURE_BASE; - float3 N; float3 *colors; } DiffuseRampBsdf; diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h index 1c7b3eb9ddd..b12e248f0a3 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet.h @@ -36,7 +36,8 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct MicrofacetExtra { - float3 color; + float3 color, cspec0; + float clearcoat; } 
MicrofacetExtra; typedef ccl_addr_space struct MicrofacetBsdf { @@ -45,7 +46,6 @@ typedef ccl_addr_space struct MicrofacetBsdf { float alpha_x, alpha_y, ior; MicrofacetExtra *extra; float3 T; - float3 N; } MicrofacetBsdf; /* Beckmann and GGX microfacet importance sampling. */ @@ -233,6 +233,36 @@ ccl_device_forceinline float3 microfacet_sample_stretched( return normalize(make_float3(-slope_x, -slope_y, 1.0f)); } +/* Calculate the reflection color + * + * If fresnel is used, the color is an interpolation of the F0 color and white + * with respect to the fresnel + * + * Else it is simply white + */ +ccl_device_forceinline float3 reflection_color(const MicrofacetBsdf *bsdf, float3 L, float3 H) { + float3 F = make_float3(1.0f, 1.0f, 1.0f); + bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID + || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID + || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID); + + if(use_fresnel) { + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + + F = interpolate_fresnel_color(L, H, bsdf->ior, F0, bsdf->extra->cspec0); + } + + return F; +} + +ccl_device_forceinline float D_GTR1(float NdotH, float alpha) +{ + if(alpha >= 1.0f) return M_1_PI_F; + float alpha2 = alpha*alpha; + float t = 1.0f + (alpha2 - 1.0f) * NdotH*NdotH; + return (alpha2 - 1.0f) / (M_PI_F * logf(alpha2) * t); +} + /* GGX microfacet with Smith shadow-masking from: * * Microfacet Models for Refraction through Rough Surfaces @@ -248,14 +278,52 @@ ccl_device_forceinline float3 microfacet_sample_stretched( ccl_device int bsdf_microfacet_ggx_setup(MicrofacetBsdf *bsdf) { + bsdf->extra = NULL; + bsdf->alpha_x = saturate(bsdf->alpha_x); bsdf->alpha_y = bsdf->alpha_x; - + bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ID; return SD_BSDF|SD_BSDF_HAS_EVAL; } +ccl_device int bsdf_microfacet_ggx_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); + bsdf->extra->cspec0.y = 
saturate(bsdf->extra->cspec0.y); + bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); + + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + + bsdf->alpha_x = saturate(bsdf->alpha_x); + bsdf->alpha_y = bsdf->alpha_x; + + bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID; + + return SD_BSDF|SD_BSDF_HAS_EVAL; +} + +ccl_device int bsdf_microfacet_ggx_clearcoat_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); + bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y); + bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); + + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= 0.25f * bsdf->extra->clearcoat * F; + + bsdf->alpha_x = saturate(bsdf->alpha_x); + bsdf->alpha_y = bsdf->alpha_x; + + bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID; + + return SD_BSDF|SD_BSDF_HAS_EVAL; +} + ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosure *b) { const MicrofacetBsdf *bsdf_a = (const MicrofacetBsdf*)a; @@ -273,16 +341,38 @@ ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosur ccl_device int bsdf_microfacet_ggx_aniso_setup(MicrofacetBsdf *bsdf) { + bsdf->extra = NULL; + bsdf->alpha_x = saturate(bsdf->alpha_x); bsdf->alpha_y = saturate(bsdf->alpha_y); - + bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID; return SD_BSDF|SD_BSDF_HAS_EVAL; } +ccl_device int bsdf_microfacet_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); + bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y); + bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); + + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = 
average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + + bsdf->alpha_x = saturate(bsdf->alpha_x); + bsdf->alpha_y = saturate(bsdf->alpha_y); + + bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID; + + return SD_BSDF|SD_BSDF_HAS_EVAL; +} + ccl_device int bsdf_microfacet_ggx_refraction_setup(MicrofacetBsdf *bsdf) { + bsdf->extra = NULL; + bsdf->alpha_x = saturate(bsdf->alpha_x); bsdf->alpha_y = bsdf->alpha_x; @@ -319,6 +409,8 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons float alpha2 = alpha_x * alpha_y; float D, G1o, G1i; + bool is_principled_clearcoat = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID); + if(alpha_x == alpha_y) { /* isotropic * eq. 20: (F*G*D)/(4*in*on) @@ -327,7 +419,18 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons float cosThetaM2 = cosThetaM * cosThetaM; float cosThetaM4 = cosThetaM2 * cosThetaM2; float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2; - D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); + + if(is_principled_clearcoat) { + /* use GTR1 for clearcoat */ + D = D_GTR1(cosThetaM, bsdf->alpha_x); + + /* the alpha value for clearcoat is a fixed 0.25 => alpha2 = 0.25 * 0.25 */ + alpha2 = 0.0625f; + } + else { + /* use GTR2 otherwise */ + D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); + } /* eq. 34: now calculate G1(i,m) and G1(o,m) */ G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO))); @@ -374,7 +477,13 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons /* eq. 20 */ float common = D * 0.25f / cosNO; - float out = G * common; + + float3 F = reflection_color(bsdf, omega_in, m); + if(is_principled_clearcoat) { + F *= 0.25f * bsdf->extra->clearcoat; + } + + float3 out = F * G * common; /* eq. 
2 in distribution of visible normals sampling * pm = Dw = G1o * dot(m, I) * D / dot(N, I); */ @@ -384,7 +493,7 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons * pdf = pm * 0.25 / dot(m, I); */ *pdf = G1o * common; - return make_float3(out, out, out); + return out; } return make_float3(0.0f, 0.0f, 0.0f); @@ -489,6 +598,17 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure /* some high number for MIS */ *pdf = 1e6f; *eval = make_float3(1e6f, 1e6f, 1e6f); + + bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID + || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID + || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID); + + /* if fresnel is used, calculate the color with reflection_color(...) */ + if(use_fresnel) { + *pdf = 1.0f; + *eval = reflection_color(bsdf, *omega_in, m); + } + label = LABEL_REFLECT | LABEL_SINGULAR; } else { @@ -497,16 +617,32 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure float alpha2 = alpha_x * alpha_y; float D, G1i; + bool is_principled_clearcoat = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID); + if(alpha_x == alpha_y) { /* isotropic */ float cosThetaM2 = cosThetaM * cosThetaM; float cosThetaM4 = cosThetaM2 * cosThetaM2; float tanThetaM2 = 1/(cosThetaM2) - 1; - D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); /* eval BRDF*cosNI */ float cosNI = dot(N, *omega_in); + if(is_principled_clearcoat) { + /* use GTR1 for clearcoat */ + D = D_GTR1(cosThetaM, bsdf->alpha_x); + + /* the alpha value for clearcoat is a fixed 0.25 => alpha2 = 0.25 * 0.25 */ + alpha2 = 0.0625f; + + /* recalculate G1o */ + G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO))); + } + else { + /* use GTR2 otherwise */ + D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); + } + /* eq. 
34: now calculate G1(i,m) */ G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); } @@ -538,10 +674,14 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure /* see eval function for derivation */ float common = (G1o * D) * 0.25f / cosNO; - float out = G1i * common; *pdf = common; - *eval = make_float3(out, out, out); + float3 F = reflection_color(bsdf, *omega_in, m); + if(is_principled_clearcoat) { + F *= 0.25f * bsdf->extra->clearcoat; + } + + *eval = G1i * common * F; } #ifdef __RAY_DIFFERENTIALS__ diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h index 7d87727004f..2f2c35d5d1f 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h @@ -40,20 +40,20 @@ ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha) } /* Sample slope distribution (based on page 14 of the supplemental implementation). 
*/ -ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU) +ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float randx, const float randy) { - if(cosI > 0.9999f || cosI < 1e-6f) { - const float r = sqrtf(randU.x / max(1.0f - randU.x, 1e-7f)); - const float phi = M_2PI_F * randU.y; + if(cosI > 0.9999f || fabsf(cosI) < 1e-6f) { + const float r = sqrtf(randx / max(1.0f - randx, 1e-7f)); + const float phi = M_2PI_F * randy; return make_float2(r*cosf(phi), r*sinf(phi)); } - const float sinI = sqrtf(1.0f - cosI*cosI); + const float sinI = safe_sqrtf(1.0f - cosI*cosI); const float tanI = sinI/cosI; const float projA = 0.5f * (cosI + 1.0f); if(projA < 0.0001f) return make_float2(0.0f, 0.0f); - const float A = 2.0f*randU.x*projA / cosI - 1.0f; + const float A = 2.0f*randx*projA / cosI - 1.0f; float tmp = A*A-1.0f; if(fabsf(tmp) < 1e-7f) return make_float2(0.0f, 0.0f); @@ -64,24 +64,24 @@ ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 ran const float slopeX = (A < 0.0f || slopeX2 > 1.0f/tanI)? (tanI*tmp - D) : slopeX2; float U2; - if(randU.y >= 0.5f) - U2 = 2.0f*(randU.y - 0.5f); + if(randy >= 0.5f) + U2 = 2.0f*(randy - 0.5f); else - U2 = 2.0f*(0.5f - randU.y); + U2 = 2.0f*(0.5f - randy); const float z = (U2*(U2*(U2*0.27385f-0.73369f)+0.46341f)) / (U2*(U2*(U2*0.093073f+0.309420f)-1.0f)+0.597999f); const float slopeY = z * sqrtf(1.0f + slopeX*slopeX); - if(randU.y >= 0.5f) + if(randy >= 0.5f) return make_float2(slopeX, slopeY); else return make_float2(slopeX, -slopeY); } /* Visible normal sampling for the GGX distribution (based on page 7 of the supplemental implementation). 
*/ -ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float2 randU) +ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float randx, const float randy) { const float3 wi_11 = normalize(make_float3(alpha.x*wi.x, alpha.y*wi.y, wi.z)); - const float2 slope_11 = mf_sampleP22_11(wi_11.z, randU); + const float2 slope_11 = mf_sampleP22_11(wi_11.z, randx, randy); const float3 cossin_phi = safe_normalize(make_float3(wi_11.x, wi_11.y, 0.0f)); const float slope_x = alpha.x*(cossin_phi.x * slope_11.x - cossin_phi.y * slope_11.y); @@ -91,18 +91,15 @@ ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha return normalize(make_float3(-slope_x, -slope_y, 1.0f)); } -/* === Phase functions: Glossy, Diffuse and Glass === */ +/* === Phase functions: Glossy and Glass === */ -/* Phase function for reflective materials, either without a fresnel term (for compatibility) or with the conductive fresnel term. */ -ccl_device_forceinline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, float3 *k, float3 *weight, const float3 wm) +/* Phase function for reflective materials. */ +ccl_device_forceinline float3 mf_sample_phase_glossy(const float3 wi, float3 *weight, const float3 wm) { - if(n && k) - *weight *= fresnel_conductor(dot(wi, wm), *n, *k); - return -wi + 2.0f * wm * dot(wi, wm); } -ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha, float3 *n, float3 *k) +ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha) { if(w.z > 0.9999f) return make_float3(0.0f, 0.0f, 0.0f); @@ -123,30 +120,9 @@ ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float l else phase *= D_ggx_aniso(wh, alpha); - if(n && k) { - /* Apply conductive fresnel term. 
*/ - return phase * fresnel_conductor(dotW_WH, *n, *k); - } - return make_float3(phase, phase, phase); } -/* Phase function for rough lambertian diffuse surfaces. */ -ccl_device_forceinline float3 mf_sample_phase_diffuse(const float3 wm, const float randu, const float randv) -{ - float3 tm, bm; - make_orthonormals(wm, &tm, &bm); - - float2 disk = concentric_sample_disk(randu, randv); - return disk.x*tm + disk.y*bm + safe_sqrtf(1.0f - disk.x*disk.x - disk.y*disk.y)*wm; -} - -ccl_device_forceinline float3 mf_eval_phase_diffuse(const float3 w, const float3 wm) -{ - const float v = max(0.0f, dot(w, wm)) * M_1_PI_F; - return make_float3(v, v, v); -} - /* Phase function for dielectric transmissive materials, including both reflection and refraction according to the dielectric fresnel term. */ ccl_device_forceinline float3 mf_sample_phase_glass(const float3 wi, const float eta, const float3 wm, const float randV, bool *outside) { @@ -269,40 +245,69 @@ ccl_device_forceinline float mf_ggx_albedo(float r) return saturate(albedo); } +ccl_device_inline float mf_ggx_transmission_albedo(float a, float ior) +{ + if(ior < 1.0f) { + ior = 1.0f/ior; + } + a = saturate(a); + ior = clamp(ior, 1.0f, 3.0f); + float I_1 = 0.0476898f*expf(-0.978352f*(ior-0.65657f)*(ior-0.65657f)) - 0.033756f*ior + 0.993261f; + float R_1 = (((0.116991f*a - 0.270369f)*a + 0.0501366f)*a - 0.00411511f)*a + 1.00008f; + float I_2 = (((-2.08704f*ior + 26.3298f)*ior - 127.906f)*ior + 292.958f)*ior - 287.946f + 199.803f/(ior*ior) - 101.668f/(ior*ior*ior); + float R_2 = ((((5.3725f*a -24.9307f)*a + 22.7437f)*a - 3.40751f)*a + 0.0986325f)*a + 0.00493504f; + + return saturate(1.0f + I_2*R_2*0.0019127f - (1.0f - I_1)*(1.0f - R_1)*9.3205f); +} + ccl_device_forceinline float mf_ggx_pdf(const float3 wi, const float3 wo, const float alpha) { float D = D_ggx(normalize(wi+wo), alpha); float lambda = mf_lambda(wi, make_float2(alpha, alpha)); + float singlescatter = 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f); + + float 
multiscatter = wo.z * M_1_PI_F; + float albedo = mf_ggx_albedo(alpha); - return 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f) + (1.0f - albedo) * wo.z; + return albedo*singlescatter + (1.0f - albedo)*multiscatter; } ccl_device_forceinline float mf_ggx_aniso_pdf(const float3 wi, const float3 wo, const float2 alpha) { - return 0.25f * D_ggx_aniso(normalize(wi+wo), alpha) / ((1.0f + mf_lambda(wi, alpha)) * wi.z) + (1.0f - mf_ggx_albedo(sqrtf(alpha.x*alpha.y))) * wo.z; -} + float D = D_ggx_aniso(normalize(wi+wo), alpha); + float lambda = mf_lambda(wi, alpha); + float singlescatter = 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f); -ccl_device_forceinline float mf_diffuse_pdf(const float3 wo) -{ - return M_1_PI_F * wo.z; + float multiscatter = wo.z * M_1_PI_F; + + float albedo = mf_ggx_albedo(sqrtf(alpha.x*alpha.y)); + return albedo*singlescatter + (1.0f - albedo)*multiscatter; } ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, const float alpha, const float eta) { - float3 wh; - float fresnel; - if(wi.z*wo.z > 0.0f) { - wh = normalize(wi + wo); - fresnel = fresnel_dielectric_cos(dot(wi, wh), eta); - } - else { - wh = normalize(wi + wo*eta); - fresnel = 1.0f - fresnel_dielectric_cos(dot(wi, wh), eta); - } + bool reflective = (wi.z*wo.z > 0.0f); + + float wh_len; + float3 wh = normalize_len(wi + (reflective? wo : (wo*eta)), &wh_len); if(wh.z < 0.0f) wh = -wh; float3 r_wi = (wi.z < 0.0f)? 
-wi: wi; - return fresnel * max(0.0f, dot(r_wi, wh)) * D_ggx(wh, alpha) / ((1.0f + mf_lambda(r_wi, make_float2(alpha, alpha))) * r_wi.z) + fabsf(wo.z); + float lambda = mf_lambda(r_wi, make_float2(alpha, alpha)); + float D = D_ggx(wh, alpha); + float fresnel = fresnel_dielectric_cos(dot(r_wi, wh), eta); + + float multiscatter = fabsf(wo.z * M_1_PI_F); + if(reflective) { + float singlescatter = 0.25f * D / max((1.0f + lambda) * r_wi.z, 1e-7f); + float albedo = mf_ggx_albedo(alpha); + return fresnel * (albedo*singlescatter + (1.0f - albedo)*multiscatter); + } + else { + float singlescatter = fabsf(dot(r_wi, wh)*dot(wo, wh) * D * eta*eta / max((1.0f + lambda) * r_wi.z * wh_len*wh_len, 1e-7f)); + float albedo = mf_ggx_transmission_albedo(alpha, eta); + return (1.0f - fresnel) * (albedo*singlescatter + (1.0f - albedo)*multiscatter); + } } /* === Actual random walk implementations, one version of mf_eval and mf_sample per phase function. === */ @@ -315,13 +320,6 @@ ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, cons #define MF_MULTI_GLASS #include "kernel/closure/bsdf_microfacet_multi_impl.h" -/* The diffuse phase function is not implemented as a node yet. 
*/ -#if 0 -#define MF_PHASE_FUNCTION diffuse -#define MF_MULTI_DIFFUSE -#include "kernel/closure/bsdf_microfacet_multi_impl.h" -#endif - #define MF_PHASE_FUNCTION glossy #define MF_MULTI_GLOSSY #include "kernel/closure/bsdf_microfacet_multi_impl.h" @@ -345,8 +343,9 @@ ccl_device int bsdf_microfacet_multi_ggx_common_setup(MicrofacetBsdf *bsdf) bsdf->extra->color.x = saturate(bsdf->extra->color.x); bsdf->extra->color.y = saturate(bsdf->extra->color.y); bsdf->extra->color.z = saturate(bsdf->extra->color.z); - - bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID; + bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); + bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y); + bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG; } @@ -356,6 +355,22 @@ ccl_device int bsdf_microfacet_multi_ggx_aniso_setup(MicrofacetBsdf *bsdf) if(is_zero(bsdf->T)) bsdf->T = make_float3(1.0f, 0.0f, 0.0f); + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID; + + return bsdf_microfacet_multi_ggx_common_setup(bsdf); +} + +ccl_device int bsdf_microfacet_multi_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + if(is_zero(bsdf->T)) + bsdf->T = make_float3(1.0f, 0.0f, 0.0f); + + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID; + + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + return bsdf_microfacet_multi_ggx_common_setup(bsdf); } @@ -363,6 +378,30 @@ ccl_device int bsdf_microfacet_multi_ggx_setup(MicrofacetBsdf *bsdf) { bsdf->alpha_y = bsdf->alpha_x; + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID; + + return bsdf_microfacet_multi_ggx_common_setup(bsdf); +} + +ccl_device int bsdf_microfacet_multi_ggx_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + bsdf->alpha_y = bsdf->alpha_x; + + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID; + + float F0 = 
fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + + return bsdf_microfacet_multi_ggx_common_setup(bsdf); +} + +ccl_device int bsdf_microfacet_multi_ggx_refraction_setup(MicrofacetBsdf *bsdf) +{ + bsdf->alpha_y = bsdf->alpha_x; + + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID; + return bsdf_microfacet_multi_ggx_common_setup(bsdf); } @@ -378,6 +417,8 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc return make_float3(0.0f, 0.0f, 0.0f); } + bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID); + bool is_aniso = (bsdf->alpha_x != bsdf->alpha_y); float3 X, Y, Z; Z = bsdf->N; @@ -393,7 +434,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc *pdf = mf_ggx_aniso_pdf(localI, localO, make_float2(bsdf->alpha_x, bsdf->alpha_y)); else *pdf = mf_ggx_pdf(localI, localO, bsdf->alpha_x); - return mf_eval_glossy(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, NULL, NULL); + return mf_eval_glossy(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0); } ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf, ccl_addr_space uint *lcg_state) @@ -407,9 +448,15 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC *omega_in = 2*dot(Z, I)*Z - I; *pdf = 1e6f; *eval = make_float3(1e6f, 1e6f, 1e6f); +#ifdef __RAY_DIFFERENTIALS__ + *domega_in_dx = (2 * dot(Z, dIdx)) * Z - dIdx; + *domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy; +#endif return LABEL_REFLECT|LABEL_SINGULAR; } + bool use_fresnel = (bsdf->type == 
CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID); + bool is_aniso = (bsdf->alpha_x != bsdf->alpha_y); if(is_aniso) make_orthonormals_tangent(Z, bsdf->T, &X, &Y); @@ -419,7 +466,7 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC float3 localI = make_float3(dot(I, X), dot(I, Y), dot(I, Z)); float3 localO; - *eval = mf_sample_glossy(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, NULL, NULL); + *eval = mf_sample_glossy(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0); if(is_aniso) *pdf = mf_ggx_aniso_pdf(localI, localO, make_float2(bsdf->alpha_x, bsdf->alpha_y)); else @@ -427,6 +474,7 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC *eval *= *pdf; *omega_in = X*localO.x + Y*localO.y + Z*localO.z; + #ifdef __RAY_DIFFERENTIALS__ *domega_in_dx = (2 * dot(Z, dIdx)) * Z - dIdx; *domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy; @@ -450,6 +498,27 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_setup(MicrofacetBsdf *bsdf) return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG; } +ccl_device int bsdf_microfacet_multi_ggx_glass_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + bsdf->alpha_x = clamp(bsdf->alpha_x, 1e-4f, 1.0f); + bsdf->alpha_y = bsdf->alpha_x; + bsdf->ior = max(0.0f, bsdf->ior); + bsdf->extra->color.x = saturate(bsdf->extra->color.x); + bsdf->extra->color.y = saturate(bsdf->extra->color.y); + bsdf->extra->color.z = saturate(bsdf->extra->color.z); + bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); + bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y); + bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); + + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID; + + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + + return 
SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG; +} + ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf, ccl_addr_space uint *lcg_state) { const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc; @@ -465,7 +534,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_transmit(const ShaderClos float3 localO = make_float3(dot(omega_in, X), dot(omega_in, Y), dot(omega_in, Z)); *pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior); - return mf_eval_glass(localI, localO, false, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior); + return mf_eval_glass(localI, localO, false, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, false, bsdf->extra->color); } ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf, ccl_addr_space uint *lcg_state) { @@ -475,6 +544,8 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosu return make_float3(0.0f, 0.0f, 0.0f); } + bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID); + float3 X, Y, Z; Z = bsdf->N; make_orthonormals(Z, &X, &Y); @@ -483,7 +554,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosu float3 localO = make_float3(dot(omega_in, X), dot(omega_in, Y), dot(omega_in, Z)); *pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior); - return mf_eval_glass(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior); + return mf_eval_glass(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0); } ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 
*domega_in_dx, float3 *domega_in_dy, float *pdf, ccl_addr_space uint *lcg_state) @@ -525,12 +596,14 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals *kg, const S } } + bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID); + make_orthonormals(Z, &X, &Y); float3 localI = make_float3(dot(I, X), dot(I, Y), dot(I, Z)); float3 localO; - *eval = mf_sample_glass(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior); + *eval = mf_sample_glass(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0); *pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior); *eval *= *pdf; diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h index 8054fa8e849..e73915dbda7 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h @@ -26,19 +26,16 @@ * the balance heuristic isn't necessarily optimal anymore. */ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( - float3 wi, - float3 wo, - const bool wo_outside, - const float3 color, - const float alpha_x, - const float alpha_y, - ccl_addr_space uint *lcg_state -#ifdef MF_MULTI_GLASS - , const float eta -#elif defined(MF_MULTI_GLOSSY) - , float3 *n, float3 *k -#endif -) + float3 wi, + float3 wo, + const bool wo_outside, + const float3 color, + const float alpha_x, + const float alpha_y, + ccl_addr_space uint *lcg_state, + const float eta, + bool use_fresnel, + const float3 cspec0) { /* Evaluating for a shallower incoming direction produces less noise, and the properties of the BSDF guarantee reciprocity. */ bool swapped = false; @@ -71,50 +68,57 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( /* Analytically compute single scattering for lower noise. 
*/ float3 eval; + float3 throughput = make_float3(1.0f, 1.0f, 1.0f); + const float3 wh = normalize(wi+wo); #ifdef MF_MULTI_GLASS eval = mf_eval_phase_glass(-wi, lambda_r, wo, wo_outside, alpha, eta); if(wo_outside) eval *= -lambda_r / (shadowing_lambda - lambda_r); else eval *= -lambda_r * beta(-lambda_r, shadowing_lambda+1.0f); -#elif defined(MF_MULTI_DIFFUSE) - /* Diffuse has no special closed form for the single scattering bounce */ - eval = make_float3(0.0f, 0.0f, 0.0f); #else /* MF_MULTI_GLOSSY */ - const float3 wh = normalize(wi+wo); const float G2 = 1.0f / (1.0f - (lambda_r + 1.0f) + shadowing_lambda); float val = G2 * 0.25f / wi.z; if(alpha.x == alpha.y) val *= D_ggx(wh, alpha.x); else val *= D_ggx_aniso(wh, alpha); - if(n && k) { - eval = fresnel_conductor(dot(wh, wi), *n, *k) * val; - } - else { - eval = make_float3(val, val, val); - } + eval = make_float3(val, val, val); #endif + float F0 = fresnel_dielectric_cos(1.0f, eta); + if(use_fresnel) { + throughput = interpolate_fresnel_color(wi, wh, eta, F0, cspec0); + + eval *= throughput; + } + float3 wr = -wi; float hr = 1.0f; float C1_r = 1.0f; float G1_r = 0.0f; bool outside = true; - float3 throughput = make_float3(1.0f, 1.0f, 1.0f); for(int order = 0; order < 10; order++) { - /* Sample microfacet height and normal */ - if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, lcg_step_float_addrspace(lcg_state))) + /* Sample microfacet height. */ + float height_rand = lcg_step_float_addrspace(lcg_state); + if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, height_rand)) break; - float3 wm = mf_sample_vndf(-wr, alpha, make_float2(lcg_step_float_addrspace(lcg_state), - lcg_step_float_addrspace(lcg_state))); - -#ifdef MF_MULTI_DIFFUSE - if(order == 0) { - /* Compute single-scattering for diffuse. */ - const float G2_G1 = -lambda_r / (shadowing_lambda - lambda_r); - eval += throughput * G2_G1 * mf_eval_phase_diffuse(wo, wm); + /* Sample microfacet normal. 
*/ + float vndf_rand_y = lcg_step_float_addrspace(lcg_state); + float vndf_rand_x = lcg_step_float_addrspace(lcg_state); + float3 wm = mf_sample_vndf(-wr, alpha, vndf_rand_x, vndf_rand_y); + +#ifdef MF_MULTI_GLASS + if(order == 0 && use_fresnel) { + /* Evaluate amount of scattering towards wo on this microfacet. */ + float3 phase; + if(outside) + phase = mf_eval_phase_glass(wr, lambda_r, wo, wo_outside, alpha, eta); + else + phase = mf_eval_phase_glass(wr, lambda_r, -wo, !wo_outside, alpha, 1.0f / eta); + + eval = throughput * phase * mf_G1(wo_outside ? wo : -wo, mf_C1((outside == wo_outside) ? hr : -hr), shadowing_lambda); } #endif if(order > 0) { @@ -125,10 +129,8 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( phase = mf_eval_phase_glass(wr, lambda_r, wo, wo_outside, alpha, eta); else phase = mf_eval_phase_glass(wr, lambda_r, -wo, !wo_outside, alpha, 1.0f/eta); -#elif defined(MF_MULTI_DIFFUSE) - phase = mf_eval_phase_diffuse(wo, wm); #else /* MF_MULTI_GLOSSY */ - phase = mf_eval_phase_glossy(wr, lambda_r, wo, alpha, n, k) * throughput; + phase = mf_eval_phase_glossy(wr, lambda_r, wo, alpha) * throughput; #endif eval += throughput * phase * mf_G1(wo_outside? wo: -wo, mf_C1((outside == wo_outside)? hr: -hr), shadowing_lambda); } @@ -136,23 +138,32 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( /* Bounce from the microfacet. */ #ifdef MF_MULTI_GLASS bool next_outside; - wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, lcg_step_float_addrspace(lcg_state), &next_outside); + float3 wi_prev = -wr; + float phase_rand = lcg_step_float_addrspace(lcg_state); + wr = mf_sample_phase_glass(-wr, outside? 
eta: 1.0f/eta, wm, phase_rand, &next_outside); if(!next_outside) { outside = !outside; wr = -wr; hr = -hr; } -#elif defined(MF_MULTI_DIFFUSE) - wr = mf_sample_phase_diffuse(wm, - lcg_step_float_addrspace(lcg_state), - lcg_step_float_addrspace(lcg_state)); + + if(use_fresnel && !next_outside) { + throughput *= color; + } + else if(use_fresnel && order > 0) { + throughput *= interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0); + } #else /* MF_MULTI_GLOSSY */ - wr = mf_sample_phase_glossy(-wr, n, k, &throughput, wm); + if(use_fresnel && order > 0) { + throughput *= interpolate_fresnel_color(-wr, wm, eta, F0, cspec0); + } + wr = mf_sample_phase_glossy(-wr, &throughput, wm); #endif lambda_r = mf_lambda(wr, alpha); - throughput *= color; + if(!use_fresnel) + throughput *= color; C1_r = mf_C1(hr); G1_r = mf_G1(wr, C1_r, lambda_r); @@ -168,13 +179,16 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( * escaped the surface in wo. The function returns the throughput between wi and wo. * Without reflection losses due to coloring or fresnel absorption in conductors, the sampling is optimal. 
*/ -ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 *wo, const float3 color, const float alpha_x, const float alpha_y, ccl_addr_space uint *lcg_state -#ifdef MF_MULTI_GLASS - , const float eta -#elif defined(MF_MULTI_GLOSSY) - , float3 *n, float3 *k -#endif -) +ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)( + float3 wi, + float3 *wo, + const float3 color, + const float alpha_x, + const float alpha_y, + ccl_addr_space uint *lcg_state, + const float eta, + bool use_fresnel, + const float3 cspec0) { const float2 alpha = make_float2(alpha_x, alpha_y); @@ -186,37 +200,64 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 float G1_r = 0.0f; bool outside = true; + float F0 = fresnel_dielectric_cos(1.0f, eta); + if(use_fresnel) { + throughput = interpolate_fresnel_color(wi, normalize(wi + wr), eta, F0, cspec0); + } + int order; for(order = 0; order < 10; order++) { /* Sample microfacet height. */ - if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, lcg_step_float_addrspace(lcg_state))) { + float height_rand = lcg_step_float_addrspace(lcg_state); + if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, height_rand)) { /* The random walk has left the surface. */ *wo = outside? wr: -wr; return throughput; } /* Sample microfacet normal. */ - float3 wm = mf_sample_vndf(-wr, alpha, make_float2(lcg_step_float_addrspace(lcg_state), - lcg_step_float_addrspace(lcg_state))); + float vndf_rand_y = lcg_step_float_addrspace(lcg_state); + float vndf_rand_x = lcg_step_float_addrspace(lcg_state); + float3 wm = mf_sample_vndf(-wr, alpha, vndf_rand_x, vndf_rand_y); /* First-bounce color is already accounted for in mix weight. */ - if(order > 0) + if(!use_fresnel && order > 0) throughput *= color; /* Bounce from the microfacet. */ #ifdef MF_MULTI_GLASS bool next_outside; - wr = mf_sample_phase_glass(-wr, outside? 
eta: 1.0f/eta, wm, lcg_step_float_addrspace(lcg_state), &next_outside); + float3 wi_prev = -wr; + float phase_rand = lcg_step_float_addrspace(lcg_state); + wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, phase_rand, &next_outside); if(!next_outside) { hr = -hr; wr = -wr; outside = !outside; } -#elif defined(MF_MULTI_DIFFUSE) - wr = mf_sample_phase_diffuse(wm, - lcg_step_float_addrspace(lcg_state), - lcg_step_float_addrspace(lcg_state)); + + if(use_fresnel) { + if(!next_outside) { + throughput *= color; + } + else { + float3 t_color = interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0); + + if(order == 0) + throughput = t_color; + else + throughput *= t_color; + } + } #else /* MF_MULTI_GLOSSY */ - wr = mf_sample_phase_glossy(-wr, n, k, &throughput, wm); + if(use_fresnel) { + float3 t_color = interpolate_fresnel_color(-wr, wm, eta, F0, cspec0); + + if(order == 0) + throughput = t_color; + else + throughput *= t_color; + } + wr = mf_sample_phase_glossy(-wr, &throughput, wm); #endif /* Update random walk parameters. 
*/ @@ -228,6 +269,5 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 } #undef MF_MULTI_GLASS -#undef MF_MULTI_DIFFUSE #undef MF_MULTI_GLOSSY #undef MF_PHASE_FUNCTION diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h index cb342a026ef..6b770fc0c16 100644 --- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h +++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h @@ -22,7 +22,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct OrenNayarBsdf { SHADER_CLOSURE_BASE; - float3 N; float roughness; float a; float b; diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h index e152a8780db..420f94755ee 100644 --- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h +++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h @@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct PhongRampBsdf { SHADER_CLOSURE_BASE; - float3 N; float exponent; float3 *colors; } PhongRampBsdf; diff --git a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h new file mode 100644 index 00000000000..f8ca64293b0 --- /dev/null +++ b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h @@ -0,0 +1,127 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __BSDF_PRINCIPLED_DIFFUSE_H__ +#define __BSDF_PRINCIPLED_DIFFUSE_H__ + +/* DISNEY PRINCIPLED DIFFUSE BRDF + * + * Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012) + */ + +CCL_NAMESPACE_BEGIN + +typedef ccl_addr_space struct PrincipledDiffuseBsdf { + SHADER_CLOSURE_BASE; + + float roughness; +} PrincipledDiffuseBsdf; + +ccl_device float3 calculate_principled_diffuse_brdf(const PrincipledDiffuseBsdf *bsdf, + float3 N, float3 V, float3 L, float3 H, float *pdf) +{ + float NdotL = max(dot(N, L), 0.0f); + float NdotV = max(dot(N, V), 0.0f); + + if(NdotL < 0 || NdotV < 0) { + *pdf = 0.0f; + return make_float3(0.0f, 0.0f, 0.0f); + } + + float LdotH = dot(L, H); + + float FL = schlick_fresnel(NdotL), FV = schlick_fresnel(NdotV); + const float Fd90 = 0.5f + 2.0f * LdotH*LdotH * bsdf->roughness; + float Fd = (1.0f * (1.0f - FL) + Fd90 * FL) * (1.0f * (1.0f - FV) + Fd90 * FV); + + float value = M_1_PI_F * NdotL * Fd; + + return make_float3(value, value, value); +} + +ccl_device int bsdf_principled_diffuse_setup(PrincipledDiffuseBsdf *bsdf) +{ + bsdf->type = CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID; + return SD_BSDF|SD_BSDF_HAS_EVAL; +} + +ccl_device bool bsdf_principled_diffuse_merge(const ShaderClosure *a, const ShaderClosure *b) +{ + const PrincipledDiffuseBsdf *bsdf_a = (const PrincipledDiffuseBsdf*)a; + const PrincipledDiffuseBsdf *bsdf_b = (const PrincipledDiffuseBsdf*)b; + + return (isequal_float3(bsdf_a->N, bsdf_b->N) && bsdf_a->roughness == bsdf_b->roughness); +} + +ccl_device float3 bsdf_principled_diffuse_eval_reflect(const ShaderClosure *sc, const float3 I, + const float3 omega_in, float *pdf) +{ + const PrincipledDiffuseBsdf *bsdf = (const PrincipledDiffuseBsdf *)sc; + + float3 N = bsdf->N; + float3 V = I; // outgoing + float3 L = omega_in; // incoming + float3 H = normalize(L + V); + + if(dot(N, omega_in) > 0.0f) { + *pdf = fmaxf(dot(N, omega_in), 0.0f) * M_1_PI_F; + return calculate_principled_diffuse_brdf(bsdf, N, V, 
L, H, pdf); + } + else { + *pdf = 0.0f; + return make_float3(0.0f, 0.0f, 0.0f); + } +} + +ccl_device float3 bsdf_principled_diffuse_eval_transmit(const ShaderClosure *sc, const float3 I, + const float3 omega_in, float *pdf) +{ + return make_float3(0.0f, 0.0f, 0.0f); +} + +ccl_device int bsdf_principled_diffuse_sample(const ShaderClosure *sc, + float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, + float3 *eval, float3 *omega_in, float3 *domega_in_dx, + float3 *domega_in_dy, float *pdf) +{ + const PrincipledDiffuseBsdf *bsdf = (const PrincipledDiffuseBsdf *)sc; + + float3 N = bsdf->N; + + sample_cos_hemisphere(N, randu, randv, omega_in, pdf); + + if(dot(Ng, *omega_in) > 0) { + float3 H = normalize(I + *omega_in); + + *eval = calculate_principled_diffuse_brdf(bsdf, N, I, *omega_in, H, pdf); + +#ifdef __RAY_DIFFERENTIALS__ + // TODO: find a better approximation for the diffuse bounce + *domega_in_dx = -((2 * dot(N, dIdx)) * N - dIdx); + *domega_in_dy = -((2 * dot(N, dIdy)) * N - dIdy); +#endif + } + else { + *pdf = 0.0f; + } + return LABEL_REFLECT|LABEL_DIFFUSE; +} + +CCL_NAMESPACE_END + +#endif /* __BSDF_PRINCIPLED_DIFFUSE_H__ */ + + diff --git a/intern/cycles/kernel/closure/bsdf_principled_sheen.h b/intern/cycles/kernel/closure/bsdf_principled_sheen.h new file mode 100644 index 00000000000..f4476bfecd0 --- /dev/null +++ b/intern/cycles/kernel/closure/bsdf_principled_sheen.h @@ -0,0 +1,113 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __BSDF_PRINCIPLED_SHEEN_H__ +#define __BSDF_PRINCIPLED_SHEEN_H__ + +/* DISNEY PRINCIPLED SHEEN BRDF + * + * Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012) + */ + +CCL_NAMESPACE_BEGIN + +typedef ccl_addr_space struct PrincipledSheenBsdf { + SHADER_CLOSURE_BASE; +} PrincipledSheenBsdf; + +ccl_device float3 calculate_principled_sheen_brdf(const PrincipledSheenBsdf *bsdf, + float3 N, float3 V, float3 L, float3 H, float *pdf) +{ + float NdotL = dot(N, L); + float NdotV = dot(N, V); + + if(NdotL < 0 || NdotV < 0) { + *pdf = 0.0f; + return make_float3(0.0f, 0.0f, 0.0f); + } + + float LdotH = dot(L, H); + + float value = schlick_fresnel(LdotH) * NdotL; + + return make_float3(value, value, value); +} + +ccl_device int bsdf_principled_sheen_setup(PrincipledSheenBsdf *bsdf) +{ + bsdf->type = CLOSURE_BSDF_PRINCIPLED_SHEEN_ID; + return SD_BSDF|SD_BSDF_HAS_EVAL; +} + +ccl_device float3 bsdf_principled_sheen_eval_reflect(const ShaderClosure *sc, const float3 I, + const float3 omega_in, float *pdf) +{ + const PrincipledSheenBsdf *bsdf = (const PrincipledSheenBsdf *)sc; + + float3 N = bsdf->N; + float3 V = I; // outgoing + float3 L = omega_in; // incoming + float3 H = normalize(L + V); + + if(dot(N, omega_in) > 0.0f) { + *pdf = fmaxf(dot(N, omega_in), 0.0f) * M_1_PI_F; + return calculate_principled_sheen_brdf(bsdf, N, V, L, H, pdf); + } + else { + *pdf = 0.0f; + return make_float3(0.0f, 0.0f, 0.0f); + } +} + +ccl_device float3 bsdf_principled_sheen_eval_transmit(const ShaderClosure *sc, const float3 I, + const float3 omega_in, float *pdf) +{ + return make_float3(0.0f, 0.0f, 0.0f); +} + +ccl_device int bsdf_principled_sheen_sample(const ShaderClosure *sc, + float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, + float3 *eval, float3 *omega_in, float3 *domega_in_dx, + float3 *domega_in_dy, float *pdf) +{ + 
const PrincipledSheenBsdf *bsdf = (const PrincipledSheenBsdf *)sc; + + float3 N = bsdf->N; + + sample_cos_hemisphere(N, randu, randv, omega_in, pdf); + + if(dot(Ng, *omega_in) > 0) { + float3 H = normalize(I + *omega_in); + + *eval = calculate_principled_sheen_brdf(bsdf, N, I, *omega_in, H, pdf); + +#ifdef __RAY_DIFFERENTIALS__ + // TODO: find a better approximation for the diffuse bounce + *domega_in_dx = -((2 * dot(N, dIdx)) * N - dIdx); + *domega_in_dy = -((2 * dot(N, dIdy)) * N - dIdy); +#endif + } + else { + *pdf = 0.0f; + } + return LABEL_REFLECT|LABEL_DIFFUSE; +} + +CCL_NAMESPACE_END + +#endif /* __BSDF_PRINCIPLED_SHEEN_H__ */ + + diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h index 28e775bcbc8..d8b6d8ddead 100644 --- a/intern/cycles/kernel/closure/bsdf_toon.h +++ b/intern/cycles/kernel/closure/bsdf_toon.h @@ -38,7 +38,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct ToonBsdf { SHADER_CLOSURE_BASE; - float3 N; float size; float smooth; } ToonBsdf; diff --git a/intern/cycles/kernel/closure/bsdf_util.h b/intern/cycles/kernel/closure/bsdf_util.h index b0c5280b6cb..3dc15d5791c 100644 --- a/intern/cycles/kernel/closure/bsdf_util.h +++ b/intern/cycles/kernel/closure/bsdf_util.h @@ -124,6 +124,13 @@ ccl_device float3 fresnel_conductor(float cosi, const float3 eta, const float3 k return(Rparl2 + Rperp2) * 0.5f; } +ccl_device float schlick_fresnel(float u) +{ + float m = clamp(1.0f - u, 0.0f, 1.0f); + float m2 = m * m; + return m2 * m2 * m; // pow(m, 5) +} + ccl_device float smooth_step(float edge0, float edge1, float x) { float result; @@ -136,6 +143,19 @@ ccl_device float smooth_step(float edge0, float edge1, float x) return result; } +/* Calculate the fresnel color which is a blend between white and the F0 color (cspec0) */ +ccl_device_forceinline float3 interpolate_fresnel_color(float3 L, float3 H, float ior, float F0, float3 cspec0) { + /* Calculate the fresnel interpolation factor + * The value from 
fresnel_dielectric_cos(...) has to be normalized because + * the cspec0 keeps the F0 color + */ + float F0_norm = 1.0f / (1.0f - F0); + float FH = (fresnel_dielectric_cos(dot(L, H), ior) - F0) * F0_norm; + + /* Blend between white and a specular color with respect to the fresnel */ + return cspec0 * (1.0f - FH) + make_float3(1.0f, 1.0f, 1.0f) * FH; +} + CCL_NAMESPACE_END #endif /* __BSDF_UTIL_H__ */ diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h index af0bbd861a9..f733ea4c517 100644 --- a/intern/cycles/kernel/closure/bssrdf.h +++ b/intern/cycles/kernel/closure/bssrdf.h @@ -27,7 +27,7 @@ typedef ccl_addr_space struct Bssrdf { float d; float texture_blur; float albedo; - float3 N; + float roughness; } Bssrdf; /* Planar Truncated Gaussian @@ -360,10 +360,32 @@ ccl_device int bssrdf_setup(Bssrdf *bssrdf, ClosureType type) { if(bssrdf->radius < BSSRDF_MIN_RADIUS) { /* revert to diffuse BSDF if radius too small */ - DiffuseBsdf *bsdf = (DiffuseBsdf*)bssrdf; - bsdf->N = bssrdf->N; - int flag = bsdf_diffuse_setup(bsdf); - bsdf->type = CLOSURE_BSDF_BSSRDF_ID; + int flag; +#ifdef __PRINCIPLED__ + if(type == CLOSURE_BSSRDF_PRINCIPLED_ID) { + float roughness = bssrdf->roughness; + float3 N = bssrdf->N; + float3 weight = bssrdf->weight; + float sample_weight = bssrdf->sample_weight; + + PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bssrdf; + + bsdf->N = N; + bsdf->roughness = roughness; + bsdf->weight = weight; + bsdf->sample_weight = sample_weight; + flag = bsdf_principled_diffuse_setup(bsdf); + bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID; + } + else +#endif /* __PRINCIPLED__ */ + { + DiffuseBsdf *bsdf = (DiffuseBsdf*)bssrdf; + bsdf->N = bssrdf->N; + flag = bsdf_diffuse_setup(bsdf); + bsdf->type = CLOSURE_BSDF_BSSRDF_ID; + } + return flag; } else { @@ -371,7 +393,9 @@ ccl_device int bssrdf_setup(Bssrdf *bssrdf, ClosureType type) bssrdf->sharpness = saturate(bssrdf->sharpness); bssrdf->type = type; - if(type == 
CLOSURE_BSSRDF_BURLEY_ID) { + if(type == CLOSURE_BSSRDF_BURLEY_ID || + type == CLOSURE_BSSRDF_PRINCIPLED_ID) + { bssrdf_burley_setup(bssrdf); } @@ -385,7 +409,7 @@ ccl_device void bssrdf_sample(const ShaderClosure *sc, float xi, float *r, float bssrdf_cubic_sample(sc, xi, r, h); else if(sc->type == CLOSURE_BSSRDF_GAUSSIAN_ID) bssrdf_gaussian_sample(sc, xi, r, h); - else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID)*/ + else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID || sc->type == CLOSURE_BSSRDF_PRINCIPLED_ID)*/ bssrdf_burley_sample(sc, xi, r, h); } @@ -395,7 +419,7 @@ ccl_device_forceinline float bssrdf_pdf(const ShaderClosure *sc, float r) return bssrdf_cubic_pdf(sc, r); else if(sc->type == CLOSURE_BSSRDF_GAUSSIAN_ID) return bssrdf_gaussian_pdf(sc, r); - else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID)*/ + else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID || sc->type == CLOSURE_BSSRDF_PRINCIPLED_ID)*/ return bssrdf_burley_pdf(sc, r); } diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h new file mode 100644 index 00000000000..f6e474d6702 --- /dev/null +++ b/intern/cycles/kernel/filter/filter.h @@ -0,0 +1,52 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __FILTER_H__ +#define __FILTER_H__ + +/* CPU Filter Kernel Interface */ + +#include "util/util_types.h" + +#include "kernel/filter/filter_defines.h" + +CCL_NAMESPACE_BEGIN + +#define KERNEL_NAME_JOIN(x, y, z) x ## _ ## y ## _ ## z +#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name) +#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name) + +#define KERNEL_ARCH cpu +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/filter_cpu.h" + +CCL_NAMESPACE_END + +#endif /* __FILTER_H__ */ diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h new file mode 100644 index 00000000000..ce96f733aff --- /dev/null +++ b/intern/cycles/kernel/filter/filter_defines.h @@ -0,0 +1,38 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __FILTER_DEFINES_H__ +#define __FILTER_DEFINES_H__ + +#define DENOISE_FEATURES 10 +#define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES) +#define XTWX_SIZE (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2) +#define XTWY_SIZE (DENOISE_FEATURES+1) + +typedef struct TilesInfo { + int offsets[9]; + int strides[9]; + int x[4]; + int y[4]; + /* TODO(lukas): CUDA doesn't have uint64_t... */ +#ifdef __KERNEL_OPENCL__ + ccl_global float *buffers[9]; +#else + long long int buffers[9]; +#endif +} TilesInfo; + +#endif /* __FILTER_DEFINES_H__*/ diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h new file mode 100644 index 00000000000..6226ed2c2ef --- /dev/null +++ b/intern/cycles/kernel/filter/filter_features.h @@ -0,0 +1,124 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + CCL_NAMESPACE_BEGIN + +#define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride] + +/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y). + * pixel_buffer always points to the current pixel in the first pass. 
*/ +#define FOR_PIXEL_WINDOW pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ + for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ + for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) { + +#define END_FOR_PIXEL_WINDOW } \ + pixel_buffer += buffer_w - (high.x - low.x); \ + } + +ccl_device_inline void filter_get_features(int2 pixel, + const ccl_global float *ccl_restrict buffer, + float *features, + const float *ccl_restrict mean, + int pass_stride) +{ + features[0] = pixel.x; + features[1] = pixel.y; + features[2] = fabsf(ccl_get_feature(buffer, 0)); + features[3] = ccl_get_feature(buffer, 1); + features[4] = ccl_get_feature(buffer, 2); + features[5] = ccl_get_feature(buffer, 3); + features[6] = ccl_get_feature(buffer, 4); + features[7] = ccl_get_feature(buffer, 5); + features[8] = ccl_get_feature(buffer, 6); + features[9] = ccl_get_feature(buffer, 7); + if(mean) { + for(int i = 0; i < DENOISE_FEATURES; i++) + features[i] -= mean[i]; + } +} + +ccl_device_inline void filter_get_feature_scales(int2 pixel, + const ccl_global float *ccl_restrict buffer, + float *scales, + const float *ccl_restrict mean, + int pass_stride) +{ + scales[0] = fabsf(pixel.x - mean[0]); + scales[1] = fabsf(pixel.y - mean[1]); + scales[2] = fabsf(fabsf(ccl_get_feature(buffer, 0)) - mean[2]); + scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3], + ccl_get_feature(buffer, 2) - mean[4], + ccl_get_feature(buffer, 3) - mean[5])); + scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]); + scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7], + ccl_get_feature(buffer, 6) - mean[8], + ccl_get_feature(buffer, 7) - mean[9])); +} + +ccl_device_inline void filter_calculate_scale(float *scale) +{ + scale[0] = 1.0f/max(scale[0], 0.01f); + scale[1] = 1.0f/max(scale[1], 0.01f); + scale[2] = 1.0f/max(scale[2], 0.01f); + scale[6] = 1.0f/max(scale[4], 0.01f); + scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f); + 
scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f); +} + +ccl_device_inline float3 filter_get_color(const ccl_global float *ccl_restrict buffer, + int pass_stride) +{ + return make_float3(ccl_get_feature(buffer, 8), ccl_get_feature(buffer, 9), ccl_get_feature(buffer, 10)); +} + +ccl_device_inline void design_row_add(float *design_row, + int rank, + const ccl_global float *ccl_restrict transform, + int stride, + int row, + float feature) +{ + for(int i = 0; i < rank; i++) { + design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature; + } +} + +/* Fill the design row. */ +ccl_device_inline void filter_get_design_row_transform(int2 p_pixel, + const ccl_global float *ccl_restrict p_buffer, + int2 q_pixel, + const ccl_global float *ccl_restrict q_buffer, + int pass_stride, + int rank, + float *design_row, + const ccl_global float *ccl_restrict transform, + int stride) +{ + design_row[0] = 1.0f; + math_vector_zero(design_row+1, rank); + design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x); + design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y); + design_row_add(design_row, rank, transform, stride, 2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0))); + design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1)); + design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2)); + design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3)); + design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4)); + design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5)); + design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6)); + 
design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7)); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h new file mode 100644 index 00000000000..3185330994c --- /dev/null +++ b/intern/cycles/kernel/filter/filter_features_sse.h @@ -0,0 +1,105 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride) + +/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time. + * pixel_buffer always points to the first of the 4 current pixel in the first pass. + * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. 
*/ + +#define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ + for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ + __m128 y4 = _mm_set1_ps(pixel.y); \ + for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ + __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \ + __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x)); + +#define END_FOR_PIXEL_WINDOW_SSE } \ + pixel_buffer += buffer_w - (pixel.x - low.x); \ + } + +ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, + __m128 active_pixels, + const float *ccl_restrict buffer, + __m128 *features, + const __m128 *ccl_restrict mean, + int pass_stride) +{ + features[0] = x; + features[1] = y; + features[2] = _mm_fabs_ps(ccl_get_feature_sse(0)); + features[3] = ccl_get_feature_sse(1); + features[4] = ccl_get_feature_sse(2); + features[5] = ccl_get_feature_sse(3); + features[6] = ccl_get_feature_sse(4); + features[7] = ccl_get_feature_sse(5); + features[8] = ccl_get_feature_sse(6); + features[9] = ccl_get_feature_sse(7); + if(mean) { + for(int i = 0; i < DENOISE_FEATURES; i++) + features[i] = _mm_sub_ps(features[i], mean[i]); + } + for(int i = 0; i < DENOISE_FEATURES; i++) + features[i] = _mm_mask_ps(features[i], active_pixels); +} + +ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, + __m128 active_pixels, + const float *ccl_restrict buffer, + __m128 *scales, + const __m128 *ccl_restrict mean, + int pass_stride) +{ + scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels); + scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels); + + scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(_mm_fabs_ps(ccl_get_feature_sse(0)), mean[2])), active_pixels); + + __m128 diff, scale; + diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]); + scale = _mm_mul_ps(diff, diff); + diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]); + scale = _mm_add_ps(scale, 
_mm_mul_ps(diff, diff)); + diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]); + scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); + scales[3] = _mm_mask_ps(scale, active_pixels); + + scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels); + + diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]); + scale = _mm_mul_ps(diff, diff); + diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]); + scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); + diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]); + scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); + scales[5] = _mm_mask_ps(scale, active_pixels); +} + +ccl_device_inline void filter_calculate_scale_sse(__m128 *scale) +{ + scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f))); + scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f))); + scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f))); + scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f))); + + scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f))); + scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f))); +} + + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_kernel.h b/intern/cycles/kernel/filter/filter_kernel.h new file mode 100644 index 00000000000..2ef03dc0a02 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_kernel.h @@ -0,0 +1,50 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "util/util_color.h" +#include "util/util_math.h" +#include "util/util_math_fast.h" +#include "util/util_texture.h" + +#include "util/util_atomic.h" +#include "util/util_math_matrix.h" + +#include "kernel/filter/filter_defines.h" + +#include "kernel/filter/filter_features.h" +#ifdef __KERNEL_SSE3__ +# include "kernel/filter/filter_features_sse.h" +#endif + +#include "kernel/filter/filter_prefilter.h" + +#ifdef __KERNEL_GPU__ +# include "kernel/filter/filter_transform_gpu.h" +#else +# ifdef __KERNEL_SSE3__ +# include "kernel/filter/filter_transform_sse.h" +# else +# include "kernel/filter/filter_transform.h" +# endif +#endif + +#include "kernel/filter/filter_reconstruction.h" + +#ifdef __KERNEL_CPU__ +# include "kernel/filter/filter_nlm_cpu.h" +#else +# include "kernel/filter/filter_nlm_gpu.h" +#endif diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h new file mode 100644 index 00000000000..3e752bce68f --- /dev/null +++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h @@ -0,0 +1,186 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, + const float *ccl_restrict weight_image, + const float *ccl_restrict variance_image, + float *difference_image, + int4 rect, + int w, + int channel_offset, + float a, + float k_2) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + float diff = 0.0f; + int numChannels = channel_offset? 3 : 1; + for(int c = 0; c < numChannels; c++) { + float cdiff = weight_image[c*channel_offset + y*w+x] - weight_image[c*channel_offset + (y+dy)*w+(x+dx)]; + float pvar = variance_image[c*channel_offset + y*w+x]; + float qvar = variance_image[c*channel_offset + (y+dy)*w+(x+dx)]; + diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); + } + if(numChannels > 1) { + diff *= 1.0f/numChannels; + } + difference_image[y*w+x] = diff; + } + } +} + +ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict difference_image, + float *out_image, + int4 rect, + int w, + int f) +{ +#ifdef __KERNEL_SSE3__ + int aligned_lowx = (rect.x & ~(3)); + int aligned_highx = ((rect.z + 3) & ~(3)); +#endif + for(int y = rect.y; y < rect.w; y++) { + const int low = max(rect.y, y-f); + const int high = min(rect.w, y+f+1); + for(int x = rect.x; x < rect.z; x++) { + out_image[y*w+x] = 0.0f; + } + for(int y1 = low; y1 < high; y1++) { +#ifdef __KERNEL_SSE3__ + for(int x = aligned_lowx; x < aligned_highx; x+=4) { + _mm_store_ps(out_image + y*w+x, _mm_add_ps(_mm_load_ps(out_image + y*w+x), _mm_load_ps(difference_image + y1*w+x))); + } +#else + for(int x = rect.x; x < rect.z; x++) { + out_image[y*w+x] += difference_image[y1*w+x]; + } +#endif + } + for(int x = rect.x; x < rect.z; x++) { + out_image[y*w+x] *= 1.0f/(high - low); + } + } +} + +ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image, + 
float *out_image, + int4 rect, + int w, + int f) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + out_image[y*w+x] = 0.0f; + } + } + for(int dx = -f; dx <= f; dx++) { + int pos_dx = max(0, dx); + int neg_dx = min(0, dx); + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) { + out_image[y*w+x] += difference_image[y*w+dx+x]; + } + } + } + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + out_image[y*w+x] = fast_expf(-max(out_image[y*w+x] * (1.0f/(high - low)), 0.0f)); + } + } +} + +ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, + const float *ccl_restrict difference_image, + const float *ccl_restrict image, + float *out_image, + float *accum_image, + int4 rect, + int w, + int f) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + float sum = 0.0f; + for(int x1 = low; x1 < high; x1++) { + sum += difference_image[y*w+x1]; + } + float weight = sum * (1.0f/(high - low)); + accum_image[y*w+x] += weight; + out_image[y*w+x] += weight*image[(y+dy)*w+(x+dx)]; + } + } +} + +ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, + const float *ccl_restrict difference_image, + const float *ccl_restrict buffer, + float *transform, + int *rank, + float *XtWX, + float3 *XtWY, + int4 rect, + int4 filter_rect, + int w, int h, int f, + int pass_stride) +{ + /* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. 
*/ + for(int fy = max(0, rect.y-filter_rect.y); fy < min(filter_rect.w, rect.w-filter_rect.y); fy++) { + int y = fy + filter_rect.y; + for(int fx = max(0, rect.x-filter_rect.x); fx < min(filter_rect.z, rect.z-filter_rect.x); fx++) { + int x = fx + filter_rect.x; + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + float sum = 0.0f; + for(int x1 = low; x1 < high; x1++) { + sum += difference_image[y*w+x1]; + } + float weight = sum * (1.0f/(high - low)); + + int storage_ofs = fy*filter_rect.z + fx; + float *l_transform = transform + storage_ofs*TRANSFORM_SIZE; + float *l_XtWX = XtWX + storage_ofs*XTWX_SIZE; + float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE; + int *l_rank = rank + storage_ofs; + + kernel_filter_construct_gramian(x, y, 1, + dx, dy, w, h, + pass_stride, + buffer, + l_transform, l_rank, + weight, l_XtWX, l_XtWY, 0); + } + } +} + +ccl_device_inline void kernel_filter_nlm_normalize(float *out_image, + const float *ccl_restrict accum_image, + int4 rect, + int w) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + out_image[y*w+x] /= accum_image[y*w+x]; + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h new file mode 100644 index 00000000000..2c5ac807051 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h @@ -0,0 +1,144 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y, + int dx, int dy, + const ccl_global float *ccl_restrict weight_image, + const ccl_global float *ccl_restrict variance_image, + ccl_global float *difference_image, + int4 rect, int w, + int channel_offset, + float a, float k_2) +{ + float diff = 0.0f; + int numChannels = channel_offset? 3 : 1; + for(int c = 0; c < numChannels; c++) { + float cdiff = weight_image[c*channel_offset + y*w+x] - weight_image[c*channel_offset + (y+dy)*w+(x+dx)]; + float pvar = variance_image[c*channel_offset + y*w+x]; + float qvar = variance_image[c*channel_offset + (y+dy)*w+(x+dx)]; + diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); + } + if(numChannels > 1) { + diff *= 1.0f/numChannels; + } + difference_image[y*w+x] = diff; +} + +ccl_device_inline void kernel_filter_nlm_blur(int x, int y, + const ccl_global float *ccl_restrict difference_image, + ccl_global float *out_image, + int4 rect, int w, int f) +{ + float sum = 0.0f; + const int low = max(rect.y, y-f); + const int high = min(rect.w, y+f+1); + for(int y1 = low; y1 < high; y1++) { + sum += difference_image[y1*w+x]; + } + sum *= 1.0f/(high-low); + out_image[y*w+x] = sum; +} + +ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y, + const ccl_global float *ccl_restrict difference_image, + ccl_global float *out_image, + int4 rect, int w, int f) +{ + float sum = 0.0f; + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + for(int x1 = low; x1 < high; x1++) { + sum += difference_image[y*w+x1]; + } + sum *= 1.0f/(high-low); + out_image[y*w+x] = fast_expf(-max(sum, 0.0f)); +} + +ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, + int dx, int dy, + const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict image, + 
ccl_global float *out_image, + ccl_global float *accum_image, + int4 rect, int w, int f) +{ + float sum = 0.0f; + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + for(int x1 = low; x1 < high; x1++) { + sum += difference_image[y*w+x1]; + } + sum *= 1.0f/(high-low); + if(out_image) { + accum_image[y*w+x] += sum; + out_image[y*w+x] += sum*image[(y+dy)*w+(x+dx)]; + } + else { + accum_image[y*w+x] = sum; + } +} + +ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy, + int dx, int dy, + const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict buffer, + const ccl_global float *ccl_restrict transform, + ccl_global int *rank, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 rect, + int4 filter_rect, + int w, int h, int f, + int pass_stride, + int localIdx) +{ + int y = fy + filter_rect.y; + int x = fx + filter_rect.x; + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + float sum = 0.0f; + for(int x1 = low; x1 < high; x1++) { + sum += difference_image[y*w+x1]; + } + float weight = sum * (1.0f/(high - low)); + + int storage_ofs = fy*filter_rect.z + fx; + transform += storage_ofs; + rank += storage_ofs; + XtWX += storage_ofs; + XtWY += storage_ofs; + + kernel_filter_construct_gramian(x, y, + filter_rect.z*filter_rect.w, + dx, dy, w, h, + pass_stride, + buffer, + transform, rank, + weight, XtWX, XtWY, + localIdx); +} + +ccl_device_inline void kernel_filter_nlm_normalize(int x, int y, + ccl_global float *out_image, + const ccl_global float *ccl_restrict accum_image, + int4 rect, int w) +{ + out_image[y*w+x] /= accum_image[y*w+x]; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h new file mode 100644 index 00000000000..a0b89c1111f --- /dev/null +++ b/intern/cycles/kernel/filter/filter_prefilter.h @@ -0,0 +1,211 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed 
under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* First step of the shadow prefiltering, performs the shadow division and stores all data + * in a nice and easy rectangular array that can be passed to the NLM filter. + * + * Calculates: + * unfiltered: Contains the two half images of the shadow feature pass + * sampleVariance: The sample-based variance calculated in the kernel. Note: This calculation is biased in general, and especially here since the variance of the ratio can only be approximated. + * sampleVarianceV: Variance of the sample variance estimation, quite noisy (since it's essentially the buffer variance of the two variance halves) + * bufferVariance: The buffer-based variance of the shadow feature. Unbiased, but quite noisy. + */ +ccl_device void kernel_filter_divide_shadow(int sample, + ccl_global TilesInfo *tiles, + int x, int y, + ccl_global float *unfilteredA, + ccl_global float *unfilteredB, + ccl_global float *sampleVariance, + ccl_global float *sampleVarianceV, + ccl_global float *bufferVariance, + int4 rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ + int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2); + int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 
1: 2); + int tile = ytile*3+xtile; + + int offset = tiles->offsets[tile]; + int stride = tiles->strides[tile]; + const ccl_global float *ccl_restrict center_buffer = (ccl_global float*) tiles->buffers[tile]; + center_buffer += (y*stride + x + offset)*buffer_pass_stride; + center_buffer += buffer_denoising_offset + 14; + + int buffer_w = align_up(rect.z - rect.x, 4); + int idx = (y-rect.y)*buffer_w + (x - rect.x); + unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f); + unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f); + + float varA = center_buffer[2]; + float varB = center_buffer[5]; + int odd_sample = (sample+1)/2; + int even_sample = sample/2; + if(use_split_variance) { + varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample); + varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample); + } + varA /= max(odd_sample - 1, 1); + varB /= max(even_sample - 1, 1); + + sampleVariance[idx] = 0.5f*(varA + varB) / sample; + sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample); + bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * (unfilteredA[idx] - unfilteredB[idx]); +} + +/* Load a regular feature from the render buffers into the denoise buffer. + * Parameters: + * - sample: The sample amount in the buffer, used to normalize the buffer. + * - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature. + * - x, y: Current pixel + * - mean, variance: Target denoise buffers. + * - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive). + */ +ccl_device void kernel_filter_get_feature(int sample, + ccl_global TilesInfo *tiles, + int m_offset, int v_offset, + int x, int y, + ccl_global float *mean, + ccl_global float *variance, + int4 rect, int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ + int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2); + int ytile = (y < tiles->y[1])? 
0: ((y < tiles->y[2])? 1: 2); + int tile = ytile*3+xtile; + ccl_global float *center_buffer = ((ccl_global float*) tiles->buffers[tile]) + (tiles->offsets[tile] + y*tiles->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset; + + int buffer_w = align_up(rect.z - rect.x, 4); + int idx = (y-rect.y)*buffer_w + (x - rect.x); + + mean[idx] = center_buffer[m_offset] / sample; + if (sample > 1) { + if(use_split_variance) { + variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1))); + } + else { + variance[idx] = center_buffer[v_offset] / (sample * (sample-1)); + } + } + else { + /* Can't compute variance with single sample, just set it very high. */ + variance[idx] = 1e10f; + } +} + +ccl_device void kernel_filter_detect_outliers(int x, int y, + ccl_global float *image, + ccl_global float *variance, + ccl_global float *depth, + ccl_global float *out, + int4 rect, + int pass_stride) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + + int n = 0; + float values[25]; + for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) { + for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) { + int idx = (y1-rect.y)*buffer_w + (x1-rect.x); + float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride])); + + /* Find the position of L. */ + int i; + for(i = 0; i < n; i++) { + if(values[i] > L) break; + } + /* Make space for L by shifting all following values to the right. */ + for(int j = n; j > i; j--) { + values[j] = values[j-1]; + } + /* Insert L. */ + values[i] = L; + n++; + } + } + + int idx = (y-rect.y)*buffer_w + (x-rect.x); + float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride])); + + float ref = 2.0f*values[(int)(n*0.75f)]; + float fac = 1.0f; + if(L > ref) { + /* The pixel appears to be an outlier. + * However, it may just be a legitimate highlight. 
Therefore, it is checked how likely it is that the pixel + * should actually be at the reference value: + * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier. + * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight. + */ + float stddev = sqrtf(average(make_float3(variance[idx], variance[idx+pass_stride], variance[idx+2*pass_stride]))); + if(L - 3*stddev < ref) { + /* The pixel is an outlier, so negate the depth value to mark it as one. + * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */ + depth[idx] = -depth[idx]; + fac = ref/L; + variance[idx ] *= fac*fac; + variance[idx + pass_stride] *= fac*fac; + variance[idx+2*pass_stride] *= fac*fac; + } + } + out[idx ] = fac*image[idx]; + out[idx + pass_stride] = fac*image[idx + pass_stride]; + out[idx+2*pass_stride] = fac*image[idx+2*pass_stride]; +} + +/* Combine A/B buffers. + * Calculates the combined mean and the buffer variance. */ +ccl_device void kernel_filter_combine_halves(int x, int y, + ccl_global float *mean, + ccl_global float *variance, + ccl_global float *a, + ccl_global float *b, + int4 rect, int r) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + int idx = (y-rect.y)*buffer_w + (x - rect.x); + + if(mean) mean[idx] = 0.5f * (a[idx]+b[idx]); + if(variance) { + if(r == 0) variance[idx] = 0.25f * (a[idx]-b[idx])*(a[idx]-b[idx]); + else { + variance[idx] = 0.0f; + float values[25]; + int numValues = 0; + for(int py = max(y-r, rect.y); py < min(y+r+1, rect.w); py++) { + for(int px = max(x-r, rect.x); px < min(x+r+1, rect.z); px++) { + int pidx = (py-rect.y)*buffer_w + (px-rect.x); + values[numValues++] = 0.25f * (a[pidx]-b[pidx])*(a[pidx]-b[pidx]); + } + } + /* Insertion-sort the variances (fast enough for 25 elements). 
*/ + for(int i = 1; i < numValues; i++) { + float v = values[i]; + int j; + for(j = i-1; j >= 0 && values[j] > v; j--) + values[j+1] = values[j]; + values[j+1] = v; + } + variance[idx] = values[(7*numValues)/8]; + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h new file mode 100644 index 00000000000..25a3025056c --- /dev/null +++ b/intern/cycles/kernel/filter/filter_reconstruction.h @@ -0,0 +1,117 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void kernel_filter_construct_gramian(int x, int y, + int storage_stride, + int dx, int dy, + int w, int h, + int pass_stride, + const ccl_global float *ccl_restrict buffer, + const ccl_global float *ccl_restrict transform, + ccl_global int *rank, + float weight, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int localIdx) +{ + if(weight < 1e-3f) { + return; + } + + int p_offset = y *w + x; + int q_offset = (y+dy)*w + (x+dx); + +#ifdef __KERNEL_GPU__ + const int stride = storage_stride; +#else + const int stride = 1; + (void) storage_stride; +#endif + +#ifdef __KERNEL_CUDA__ + ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE]; + ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1); +#else + float design_row[DENOISE_FEATURES+1]; +#endif + + float3 q_color = filter_get_color(buffer + q_offset, pass_stride); + + /* If the pixel was flagged as an outlier during prefiltering, skip it. */ + if(ccl_get_feature(buffer + q_offset, 0) < 0.0f) { + return; + } + + filter_get_design_row_transform(make_int2(x, y), buffer + p_offset, + make_int2(x+dx, y+dy), buffer + q_offset, + pass_stride, *rank, design_row, transform, stride); + + math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride); + math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride); +} + +ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h, + ccl_global float *buffer, + ccl_global int *rank, + int storage_stride, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 buffer_params, + int sample) +{ +#ifdef __KERNEL_GPU__ + const int stride = storage_stride; +#else + const int stride = 1; + (void) storage_stride; +#endif + + if(XtWX[0] < 1e-3f) { + /* There is not enough information to determine a denoised result. + * As a fallback, keep the original value of the pixel. 
*/ + return; + } + + /* The weighted average of pixel colors (essentially, the NLM-filtered image). + * In case the solution of the linear model fails due to numerical issues, + * fall back to this value. */ + float3 mean_color = XtWY[0]/XtWX[0]; + + math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, stride); + + float3 final_color = XtWY[0]; + if(!isfinite3_safe(final_color)) { + final_color = mean_color; + } + + /* Clamp pixel value to positive values. */ + final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f)); + + ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z; + final_color *= sample; + if(buffer_params.w) { + final_color.x += combined_buffer[buffer_params.w+0]; + final_color.y += combined_buffer[buffer_params.w+1]; + final_color.z += combined_buffer[buffer_params.w+2]; + } + combined_buffer[0] = final_color.x; + combined_buffer[1] = final_color.y; + combined_buffer[2] = final_color.z; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h new file mode 100644 index 00000000000..a5f87c05ec0 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_transform.h @@ -0,0 +1,108 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, + int x, int y, int4 rect, + int pass_stride, + float *transform, int *rank, + int radius, float pca_threshold) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + + float features[DENOISE_FEATURES]; + + /* Temporary storage, used in different steps of the algorithm. */ + float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES]; + float tempvector[2*DENOISE_FEATURES]; + const float *ccl_restrict pixel_buffer; + int2 pixel; + + /* === Calculate denoising window. === */ + int2 low = make_int2(max(rect.x, x - radius), + max(rect.y, y - radius)); + int2 high = make_int2(min(rect.z, x + radius + 1), + min(rect.w, y + radius + 1)); + int num_pixels = (high.y - low.y) * (high.x - low.x); + + /* === Shift feature passes to have mean 0. === */ + float feature_means[DENOISE_FEATURES]; + math_vector_zero(feature_means, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride); + math_vector_add(feature_means, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES); + + /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */ + float *feature_scale = tempvector; + math_vector_zero(feature_scale, DENOISE_FEATURES); + + FOR_PIXEL_WINDOW { + filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_max(feature_scale, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + filter_calculate_scale(feature_scale); + + /* === Generate the feature transformation. === + * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space + * which generally has fewer dimensions. This mainly helps to prevent overfitting. 
*/ + float* feature_matrix = tempmatrix; + math_matrix_zero(feature_matrix, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_mul(features, feature_scale, DENOISE_FEATURES); + math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f); + } END_FOR_PIXEL_WINDOW + + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1); + *rank = 0; + /* Prevent overfitting when a small window is used. */ + int max_rank = min(DENOISE_FEATURES, num_pixels/3); + if(pca_threshold < 0.0f) { + float threshold_energy = 0.0f; + for(int i = 0; i < DENOISE_FEATURES; i++) { + threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + } + threshold_energy *= 1.0f - (-pca_threshold); + + float reduced_energy = 0.0f; + for(int i = 0; i < max_rank; i++, (*rank)++) { + if(i >= 2 && reduced_energy >= threshold_energy) + break; + float s = feature_matrix[i*DENOISE_FEATURES+i]; + reduced_energy += s; + } + } + else { + for(int i = 0; i < max_rank; i++, (*rank)++) { + float s = feature_matrix[i*DENOISE_FEATURES+i]; + if(i >= 2 && sqrtf(s) < pca_threshold) + break; + } + } + + /* Bake the feature scaling into the transformation matrix. */ + for(int i = 0; i < (*rank); i++) { + math_vector_mul(transform + i*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES); + } + math_matrix_transpose(transform, DENOISE_FEATURES, 1); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h new file mode 100644 index 00000000000..83a1222bbdb --- /dev/null +++ b/intern/cycles/kernel/filter/filter_transform_gpu.h @@ -0,0 +1,119 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer, + int x, int y, int4 rect, + int pass_stride, + ccl_global float *transform, + ccl_global int *rank, + int radius, float pca_threshold, + int transform_stride, int localIdx) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + +#ifdef __KERNEL_CUDA__ + ccl_local float shared_features[DENOISE_FEATURES*CCL_MAX_LOCAL_SIZE]; + ccl_local_param float *features = shared_features + localIdx*DENOISE_FEATURES; +#else + float features[DENOISE_FEATURES]; +#endif + + /* === Calculate denoising window. === */ + int2 low = make_int2(max(rect.x, x - radius), + max(rect.y, y - radius)); + int2 high = make_int2(min(rect.z, x + radius + 1), + min(rect.w, y + radius + 1)); + int num_pixels = (high.y - low.y) * (high.x - low.x); + const ccl_global float *ccl_restrict pixel_buffer; + int2 pixel; + + + + + /* === Shift feature passes to have mean 0. === */ + float feature_means[DENOISE_FEATURES]; + math_vector_zero(feature_means, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride); + math_vector_add(feature_means, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES); + + /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. 
=== */ + float feature_scale[DENOISE_FEATURES]; + math_vector_zero(feature_scale, DENOISE_FEATURES); + + FOR_PIXEL_WINDOW { + filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_max(feature_scale, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + filter_calculate_scale(feature_scale); + + + + /* === Generate the feature transformation. === + * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space + * which generally has fewer dimensions. This mainly helps to prevent overfitting. */ + float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; + math_matrix_zero(feature_matrix, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_mul(features, feature_scale, DENOISE_FEATURES); + math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f); + } END_FOR_PIXEL_WINDOW + + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride); + *rank = 0; + /* Prevent overfitting when a small window is used. */ + int max_rank = min(DENOISE_FEATURES, num_pixels/3); + if(pca_threshold < 0.0f) { + float threshold_energy = 0.0f; + for(int i = 0; i < DENOISE_FEATURES; i++) { + threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + } + threshold_energy *= 1.0f - (-pca_threshold); + + float reduced_energy = 0.0f; + for(int i = 0; i < max_rank; i++, (*rank)++) { + if(i >= 2 && reduced_energy >= threshold_energy) + break; + float s = feature_matrix[i*DENOISE_FEATURES+i]; + reduced_energy += s; + } + } + else { + for(int i = 0; i < max_rank; i++, (*rank)++) { + float s = feature_matrix[i*DENOISE_FEATURES+i]; + if(i >= 2 && sqrtf(s) < pca_threshold) + break; + } + } + + math_matrix_transpose(transform, DENOISE_FEATURES, transform_stride); + + /* Bake the feature scaling into the transformation matrix. 
*/ + for(int i = 0; i < DENOISE_FEATURES; i++) { + for(int j = 0; j < (*rank); j++) { + transform[(i*DENOISE_FEATURES + j)*transform_stride] *= feature_scale[i]; + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h new file mode 100644 index 00000000000..30dc2969b11 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_transform_sse.h @@ -0,0 +1,105 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, + int x, int y, int4 rect, + int pass_stride, + float *transform, int *rank, + int radius, float pca_threshold) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + + __m128 features[DENOISE_FEATURES]; + const float *ccl_restrict pixel_buffer; + int2 pixel; + + int2 low = make_int2(max(rect.x, x - radius), + max(rect.y, y - radius)); + int2 high = make_int2(min(rect.z, x + radius + 1), + min(rect.w, y + radius + 1)); + int num_pixels = (high.y - low.y) * (high.x - low.x); + + __m128 feature_means[DENOISE_FEATURES]; + math_vector_zero_sse(feature_means, DENOISE_FEATURES); + FOR_PIXEL_WINDOW_SSE { + filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride); + math_vector_add_sse(feature_means, DENOISE_FEATURES, features); + } END_FOR_PIXEL_WINDOW_SSE + + __m128 pixel_scale = _mm_set1_ps(1.0f / num_pixels); + for(int i = 0; i < DENOISE_FEATURES; i++) { + feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale); + } + + __m128 feature_scale[DENOISE_FEATURES]; + math_vector_zero_sse(feature_scale, DENOISE_FEATURES); + FOR_PIXEL_WINDOW_SSE { + filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); + math_vector_max_sse(feature_scale, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW_SSE + + filter_calculate_scale_sse(feature_scale); + + __m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES]; + math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES); + FOR_PIXEL_WINDOW_SSE { + filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); + math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale); + math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f)); + } END_FOR_PIXEL_WINDOW_SSE + + float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; + 
math_matrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse); + + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1); + + *rank = 0; + /* Prevent overfitting when a small window is used. */ + int max_rank = min(DENOISE_FEATURES, num_pixels/3); + if(pca_threshold < 0.0f) { + float threshold_energy = 0.0f; + for(int i = 0; i < DENOISE_FEATURES; i++) { + threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + } + threshold_energy *= 1.0f - (-pca_threshold); + + float reduced_energy = 0.0f; + for(int i = 0; i < max_rank; i++, (*rank)++) { + if(i >= 2 && reduced_energy >= threshold_energy) + break; + float s = feature_matrix[i*DENOISE_FEATURES+i]; + reduced_energy += s; + } + } + else { + for(int i = 0; i < max_rank; i++, (*rank)++) { + float s = feature_matrix[i*DENOISE_FEATURES+i]; + if(i >= 2 && sqrtf(s) < pca_threshold) + break; + } + } + + math_matrix_transpose(transform, DENOISE_FEATURES, 1); + + /* Bake the feature scaling into the transformation matrix. 
*/ + for(int i = 0; i < DENOISE_FEATURES; i++) { + math_vector_scale(transform + i*DENOISE_FEATURES, _mm_cvtss_f32(feature_scale[i]), *rank); + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h index 8888000f0e6..5c3b0ee3c15 100644 --- a/intern/cycles/kernel/geom/geom_curve.h +++ b/intern/cycles/kernel/geom/geom_curve.h @@ -565,7 +565,7 @@ ccl_device_curveintersect bool bvh_cardinal_curve_intersect(KernelGlobals *kg, I r_ext = mw_extension + r_curr; #ifdef __KERNEL_SSE__ const float3 p_curr_sq = p_curr * p_curr; - const float3 dxxx = _mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128)); + const float3 dxxx(_mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128))); float d = dxxx.x; #else float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y); diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h index 47778553b94..105aee8da15 100644 --- a/intern/cycles/kernel/geom/geom_triangle.h +++ b/intern/cycles/kernel/geom/geom_triangle.h @@ -76,7 +76,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3 /* Interpolate smooth vertex normal from vertices */ -ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v) +ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v) { /* load triangle vertices */ const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); @@ -84,7 +84,9 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y)); float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z)); - return normalize((1.0f - u - v)*n2 + u*n0 + v*n1); + float3 N = safe_normalize((1.0f - u - v)*n2 + u*n0 + v*n1); + + return is_zero(N)? 
Ng: N; } /* Ray differentials on triangle */ diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h index 06c0fb2fbca..84a988f1dbc 100644 --- a/intern/cycles/kernel/kernel.h +++ b/intern/cycles/kernel/kernel.h @@ -50,30 +50,20 @@ void kernel_tex_copy(KernelGlobals *kg, #define KERNEL_ARCH cpu #include "kernel/kernels/cpu/kernel_cpu.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 -# define KERNEL_ARCH cpu_sse2 -# include "kernel/kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 -# define KERNEL_ARCH cpu_sse3 -# include "kernel/kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 -# define KERNEL_ARCH cpu_sse41 -# include "kernel/kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX -# define KERNEL_ARCH cpu_avx -# include "kernel/kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 -# define KERNEL_ARCH cpu_avx2 -# include "kernel/kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/kernel_cpu.h" CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h index 823d30dde78..9ed16aceb55 100644 --- a/intern/cycles/kernel/kernel_accumulate.h +++ b/intern/cycles/kernel/kernel_accumulate.h @@ -220,8 +220,16 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass) #ifdef __SHADOW_TRICKS__ L->path_total = make_float3(0.0f, 0.0f, 0.0f); 
L->path_total_shaded = make_float3(0.0f, 0.0f, 0.0f); - L->shadow_color = make_float3(0.0f, 0.0f, 0.0f); + L->shadow_background_color = make_float3(0.0f, 0.0f, 0.0f); + L->shadow_radiance_sum = make_float3(0.0f, 0.0f, 0.0f); + L->shadow_throughput = 0.0f; #endif + +#ifdef __DENOISING_FEATURES__ + L->denoising_normal = make_float3(0.0f, 0.0f, 0.0f); + L->denoising_albedo = make_float3(0.0f, 0.0f, 0.0f); + L->denoising_depth = 0.0f; +#endif /* __DENOISING_FEATURES__ */ } ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput, @@ -277,15 +285,15 @@ ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 thro } ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, + ccl_addr_space PathState *state, float3 throughput, float3 alpha, float3 bsdf, - float3 ao, - int bounce) + float3 ao) { #ifdef __PASSES__ if(L->use_light_pass) { - if(bounce == 0) { + if(state->bounce == 0) { /* directly visible lighting */ L->direct_diffuse += throughput*bsdf*ao; L->ao += alpha*throughput*ao; @@ -302,31 +310,43 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, } #ifdef __SHADOW_TRICKS__ - float3 light = throughput * bsdf; - L->path_total += light; - L->path_total_shaded += ao * light; + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + float3 light = throughput * bsdf; + L->path_total += light; + L->path_total_shaded += ao * light; + } #endif } ccl_device_inline void path_radiance_accum_total_ao( PathRadiance *L, + ccl_addr_space PathState *state, float3 throughput, float3 bsdf) { #ifdef __SHADOW_TRICKS__ - L->path_total += throughput * bsdf; + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + L->path_total += throughput * bsdf; + } #else (void) L; + (void) state; (void) throughput; (void) bsdf; #endif } -ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 throughput, BsdfEval *bsdf_eval, float3 shadow, float shadow_fac, int bounce, bool is_lamp) +ccl_device_inline void 
path_radiance_accum_light(PathRadiance *L, + ccl_addr_space PathState *state, + float3 throughput, + BsdfEval *bsdf_eval, + float3 shadow, + float shadow_fac, + bool is_lamp) { #ifdef __PASSES__ if(L->use_light_pass) { - if(bounce == 0) { + if(state->bounce == 0) { /* directly visible lighting */ L->direct_diffuse += throughput*bsdf_eval->diffuse*shadow; L->direct_glossy += throughput*bsdf_eval->glossy*shadow; @@ -352,21 +372,27 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through } #ifdef __SHADOW_TRICKS__ - float3 light = throughput * bsdf_eval->sum_no_mis; - L->path_total += light; - L->path_total_shaded += shadow * light; + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + float3 light = throughput * bsdf_eval->sum_no_mis; + L->path_total += light; + L->path_total_shaded += shadow * light; + } #endif } ccl_device_inline void path_radiance_accum_total_light( PathRadiance *L, + ccl_addr_space PathState *state, float3 throughput, const BsdfEval *bsdf_eval) { #ifdef __SHADOW_TRICKS__ - L->path_total += throughput * bsdf_eval->sum_no_mis; + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + L->path_total += throughput * bsdf_eval->sum_no_mis; + } #else (void) L; + (void) state; (void) throughput; (void) bsdf_eval; #endif @@ -393,11 +419,17 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L, } #ifdef __SHADOW_TRICKS__ - L->path_total += throughput * value; - if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) { - L->path_total_shaded += throughput * value; + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + L->path_total += throughput * value; + if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) { + L->path_total_shaded += throughput * value; + } } #endif + +#ifdef __DENOISING_FEATURES__ + L->denoising_albedo += state->denoising_feature_weight * value; +#endif /* __DENOISING_FEATURES__ */ } ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L) @@ -555,29 +587,79 @@ ccl_device_inline float3 
path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi return L_sum; } +ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg, PathRadiance *L, float3 *noisy, float3 *clean) +{ +#ifdef __PASSES__ + kernel_assert(L->use_light_pass); + + *clean = L->emission + L->background; + *noisy = L->direct_scatter + L->indirect_scatter; + +# define ADD_COMPONENT(flag, component) \ + if(kernel_data.film.denoising_flags & flag) \ + *clean += component; \ + else \ + *noisy += component; + + ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_DIR, L->direct_diffuse); + ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_IND, L->indirect_diffuse); + ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_DIR, L->direct_glossy); + ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_IND, L->indirect_glossy); + ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_DIR, L->direct_transmission); + ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_IND, L->indirect_transmission); + ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_DIR, L->direct_subsurface); + ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_IND, L->indirect_subsurface); +# undef ADD_COMPONENT +#else + *noisy = L->emission; + *clean = make_float3(0.0f, 0.0f, 0.0f); +#endif + + *noisy = ensure_finite3(*noisy); + *clean = ensure_finite3(*clean); +} + ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample, int num_samples) { float fac = 1.0f/num_samples; +#ifdef __SPLIT_KERNEL__ +# define safe_float3_add(f, v) \ + do { \ + ccl_global float *p = (ccl_global float*)(&(f)); \ + atomic_add_and_fetch_float(p+0, (v).x); \ + atomic_add_and_fetch_float(p+1, (v).y); \ + atomic_add_and_fetch_float(p+2, (v).z); \ + } while(0) +#else +# define safe_float3_add(f, v) (f) += (v) +#endif /* __SPLIT_KERNEL__ */ + #ifdef __PASSES__ - L->direct_diffuse += L_sample->direct_diffuse*fac; - L->direct_glossy += L_sample->direct_glossy*fac; - L->direct_transmission += L_sample->direct_transmission*fac; - L->direct_subsurface += L_sample->direct_subsurface*fac; - L->direct_scatter += 
L_sample->direct_scatter*fac; - - L->indirect_diffuse += L_sample->indirect_diffuse*fac; - L->indirect_glossy += L_sample->indirect_glossy*fac; - L->indirect_transmission += L_sample->indirect_transmission*fac; - L->indirect_subsurface += L_sample->indirect_subsurface*fac; - L->indirect_scatter += L_sample->indirect_scatter*fac; - - L->background += L_sample->background*fac; - L->ao += L_sample->ao*fac; - L->shadow += L_sample->shadow*fac; + safe_float3_add(L->direct_diffuse, L_sample->direct_diffuse*fac); + safe_float3_add(L->direct_glossy, L_sample->direct_glossy*fac); + safe_float3_add(L->direct_transmission, L_sample->direct_transmission*fac); + safe_float3_add(L->direct_subsurface, L_sample->direct_subsurface*fac); + safe_float3_add(L->direct_scatter, L_sample->direct_scatter*fac); + + safe_float3_add(L->indirect_diffuse, L_sample->indirect_diffuse*fac); + safe_float3_add(L->indirect_glossy, L_sample->indirect_glossy*fac); + safe_float3_add(L->indirect_transmission, L_sample->indirect_transmission*fac); + safe_float3_add(L->indirect_subsurface, L_sample->indirect_subsurface*fac); + safe_float3_add(L->indirect_scatter, L_sample->indirect_scatter*fac); + + safe_float3_add(L->background, L_sample->background*fac); + safe_float3_add(L->ao, L_sample->ao*fac); + safe_float3_add(L->shadow, L_sample->shadow*fac); +# ifdef __SPLIT_KERNEL__ + atomic_add_and_fetch_float(&L->mist, L_sample->mist*fac); +# else L->mist += L_sample->mist*fac; -#endif - L->emission += L_sample->emission * fac; +# endif /* __SPLIT_KERNEL__ */ +#endif /* __PASSES__ */ + safe_float3_add(L->emission, L_sample->emission*fac); + +#undef safe_float3_add } #ifdef __SHADOW_TRICKS__ @@ -595,16 +677,17 @@ ccl_device_inline float path_radiance_sum_shadow(const PathRadiance *L) /* Calculate final light sum and transparency for shadow catcher object. 
*/ ccl_device_inline float3 path_radiance_sum_shadowcatcher(KernelGlobals *kg, const PathRadiance *L, - ccl_addr_space float* L_transparent) + float* alpha) { const float shadow = path_radiance_sum_shadow(L); float3 L_sum; if(kernel_data.background.transparent) { - *L_transparent = shadow; - L_sum = make_float3(0.0f, 0.0f, 0.0f); + *alpha = 1.0f - L->shadow_throughput * shadow; + L_sum = L->shadow_radiance_sum; } else { - L_sum = L->shadow_color * shadow; + L_sum = L->shadow_background_color * L->shadow_throughput * shadow + + L->shadow_radiance_sum; } return L_sum; } diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h index 21da180bb8e..93934ee6b38 100644 --- a/intern/cycles/kernel/kernel_compat_cpu.h +++ b/intern/cycles/kernel/kernel_compat_cpu.h @@ -195,7 +195,7 @@ template<typename T> struct texture_image { if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { return make_float4(0.0f, 0.0f, 0.0f, 0.0f); } - /* Fall through. */ + ATTR_FALLTHROUGH; case EXTENSION_EXTEND: ix = wrap_clamp(ix, width); iy = wrap_clamp(iy, height); @@ -222,7 +222,7 @@ template<typename T> struct texture_image { if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { return make_float4(0.0f, 0.0f, 0.0f, 0.0f); } - /* Fall through. */ + ATTR_FALLTHROUGH; case EXTENSION_EXTEND: nix = wrap_clamp(ix+1, width); niy = wrap_clamp(iy+1, height); @@ -265,7 +265,7 @@ template<typename T> struct texture_image { if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { return make_float4(0.0f, 0.0f, 0.0f, 0.0f); } - /* Fall through. */ + ATTR_FALLTHROUGH; case EXTENSION_EXTEND: pix = wrap_clamp(ix-1, width); piy = wrap_clamp(iy-1, height); @@ -335,7 +335,7 @@ template<typename T> struct texture_image { { return make_float4(0.0f, 0.0f, 0.0f, 0.0f); } - /* Fall through. 
*/ + ATTR_FALLTHROUGH; case EXTENSION_EXTEND: ix = wrap_clamp(ix, width); iy = wrap_clamp(iy, height); @@ -374,7 +374,7 @@ template<typename T> struct texture_image { { return make_float4(0.0f, 0.0f, 0.0f, 0.0f); } - /* Fall through. */ + ATTR_FALLTHROUGH; case EXTENSION_EXTEND: nix = wrap_clamp(ix+1, width); niy = wrap_clamp(iy+1, height); @@ -449,7 +449,7 @@ template<typename T> struct texture_image { { return make_float4(0.0f, 0.0f, 0.0f, 0.0f); } - /* Fall through. */ + ATTR_FALLTHROUGH; case EXTENSION_EXTEND: pix = wrap_clamp(ix-1, width); piy = wrap_clamp(iy-1, height); diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index c375d17a95f..38708f7ff0b 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -55,6 +55,11 @@ #define ccl_restrict __restrict__ #define ccl_align(n) __align__(n) +#define ATTR_FALLTHROUGH + +#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH) + + /* No assert supported for CUDA */ #define kernel_assert(cond) diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h index c2263ac0d49..4836c290312 100644 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ b/intern/cycles/kernel/kernel_compat_opencl.h @@ -50,6 +50,8 @@ # define ccl_addr_space #endif +#define ATTR_FALLTHROUGH + #define ccl_local_id(d) get_local_id(d) #define ccl_global_id(d) get_global_id(d) diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h index c9c97ea977e..f95f0d98c52 100644 --- a/intern/cycles/kernel/kernel_globals.h +++ b/intern/cycles/kernel/kernel_globals.h @@ -19,6 +19,10 @@ #ifndef __KERNEL_GLOBALS_H__ #define __KERNEL_GLOBALS_H__ +#ifdef __KERNEL_CPU__ +# include "util/util_vector.h" +#endif + CCL_NAMESPACE_BEGIN /* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in @@ -38,12 +42,12 @@ struct Intersection; struct VolumeStep; 
typedef struct KernelGlobals { - texture_image_uchar4 texture_byte4_images[TEX_NUM_BYTE4_CPU]; - texture_image_float4 texture_float4_images[TEX_NUM_FLOAT4_CPU]; - texture_image_half4 texture_half4_images[TEX_NUM_HALF4_CPU]; - texture_image_float texture_float_images[TEX_NUM_FLOAT_CPU]; - texture_image_uchar texture_byte_images[TEX_NUM_BYTE_CPU]; - texture_image_half texture_half_images[TEX_NUM_HALF_CPU]; + vector<texture_image_float4> texture_float4_images; + vector<texture_image_uchar4> texture_byte4_images; + vector<texture_image_half4> texture_half4_images; + vector<texture_image_float> texture_float_images; + vector<texture_image_uchar> texture_byte_images; + vector<texture_image_half> texture_half_images; # define KERNEL_TEX(type, ttype, name) ttype name; # define KERNEL_IMAGE_TEX(type, ttype, name) diff --git a/intern/cycles/kernel/kernel_image_opencl.h b/intern/cycles/kernel/kernel_image_opencl.h index 0352c58037d..90747e09357 100644 --- a/intern/cycles/kernel/kernel_image_opencl.h +++ b/intern/cycles/kernel/kernel_image_opencl.h @@ -20,18 +20,19 @@ ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int offset) { + const int texture_type = kernel_tex_type(id); /* Float4 */ - if(id < TEX_START_BYTE4_OPENCL) { + if(texture_type == IMAGE_DATA_TYPE_FLOAT4) { return kernel_tex_fetch(__tex_image_float4_packed, offset); } /* Byte4 */ - else if(id < TEX_START_FLOAT_OPENCL) { + else if(texture_type == IMAGE_DATA_TYPE_BYTE4) { uchar4 r = kernel_tex_fetch(__tex_image_byte4_packed, offset); float f = 1.0f/255.0f; return make_float4(r.x*f, r.y*f, r.z*f, r.w*f); } /* Float */ - else if(id < TEX_START_BYTE_OPENCL) { + else if(texture_type == IMAGE_DATA_TYPE_FLOAT) { float f = kernel_tex_fetch(__tex_image_float_packed, offset); return make_float4(f, f, f, 1.0f); } @@ -63,23 +64,34 @@ ccl_device_inline float svm_image_texture_frac(float x, int *ix) return x - (float)i; } +ccl_device_inline uint kernel_decode_image_interpolation(uint4 info) +{ + return 
(info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR; +} + +ccl_device_inline uint kernel_decode_image_extension(uint4 info) +{ + if(info.w & (1 << 1)) { + return EXTENSION_REPEAT; + } + else if(info.w & (1 << 2)) { + return EXTENSION_EXTEND; + } + else { + return EXTENSION_CLIP; + } +} + ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y) { uint4 info = kernel_tex_fetch(__tex_image_packed_info, id*2); uint width = info.x; uint height = info.y; uint offset = info.z; - - /* Image Options */ - uint interpolation = (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR; - uint extension; - if(info.w & (1 << 1)) - extension = EXTENSION_REPEAT; - else if(info.w & (1 << 2)) - extension = EXTENSION_EXTEND; - else - extension = EXTENSION_CLIP; - + /* Decode image options. */ + uint interpolation = kernel_decode_image_interpolation(info); + uint extension = kernel_decode_image_extension(info); + /* Actual sampling. */ float4 r; int ix, iy, nix, niy; if(interpolation == INTERPOLATION_CLOSEST) { @@ -132,7 +144,6 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl r += ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width); r += ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width); } - return r; } @@ -144,17 +155,10 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, uint height = info.y; uint offset = info.z; uint depth = kernel_tex_fetch(__tex_image_packed_info, id*2+1).x; - - /* Image Options */ - uint interpolation = (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR; - uint extension; - if(info.w & (1 << 1)) - extension = EXTENSION_REPEAT; - else if(info.w & (1 << 2)) - extension = EXTENSION_EXTEND; - else - extension = EXTENSION_CLIP; - + /* Decode image options. */ + uint interpolation = kernel_decode_image_interpolation(info); + uint extension = kernel_decode_image_extension(info); + /* Actual sampling. 
*/ float4 r; int ix, iy, iz, nix, niy, niz; if(interpolation == INTERPOLATION_CLOSEST) { @@ -171,7 +175,7 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, if(extension == EXTENSION_CLIP) { if(x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) - { + { return make_float4(0.0f, 0.0f, 0.0f, 0.0f); } } @@ -198,12 +202,13 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, niz = svm_image_texture_wrap_periodic(iz+1, depth); } else { - if(extension == EXTENSION_CLIP) + if(extension == EXTENSION_CLIP) { if(x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) { return make_float4(0.0f, 0.0f, 0.0f, 0.0f); } + } /* Fall through. */ /* EXTENSION_EXTEND */ nix = svm_image_texture_wrap_clamp(ix+1, width); @@ -224,8 +229,6 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, r += tz*(1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width + niz*width*height); r += tz*ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width + niz*width*height); r += tz*ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width + niz*width*height); - } - return r; } diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h index 67546131746..f5855757d3f 100644 --- a/intern/cycles/kernel/kernel_jitter.h +++ b/intern/cycles/kernel/kernel_jitter.h @@ -175,15 +175,26 @@ ccl_device float cmj_sample_1D(int s, int N, int p) return (x + jx)*invN; } -ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy) +/* TODO(sergey): Do some extra tests and consider moving to util_math.h. 
*/ +ccl_device_inline int cmj_isqrt(int value) { - kernel_assert(s < N); - #if defined(__KERNEL_CUDA__) - int m = float_to_int(__fsqrt_ru(N)); + return float_to_int(__fsqrt_ru(value)); +#elif defined(__KERNEL_GPU__) + return float_to_int(sqrtf(value)); #else - int m = float_to_int(sqrtf(N)); + /* This is a work around for fast-math on CPU which might replace sqrtf() + * with am approximated version. + */ + return float_to_int(sqrtf(value) + 1e-6f); #endif +} + +ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy) +{ + kernel_assert(s < N); + + int m = cmj_isqrt(N); int n = (N - 1)/m + 1; float invN = 1.0f/N; float invm = 1.0f/m; diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h index a2909cec1a1..9baa9d54957 100644 --- a/intern/cycles/kernel/kernel_light.h +++ b/intern/cycles/kernel/kernel_light.h @@ -102,7 +102,7 @@ ccl_device_inline float area_light_sample(float3 P, float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f); cu = clamp(cu, -1.0f, 1.0f); /* Compute xu. */ - float xu = -(cu * z0) / sqrtf(1.0f - cu * cu); + float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f); xu = clamp(xu, x0, x1); /* Compute yv. */ float z0sq = z0 * z0; diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h index ed523696571..9cd7ffb181d 100644 --- a/intern/cycles/kernel/kernel_passes.h +++ b/intern/cycles/kernel/kernel_passes.h @@ -60,6 +60,140 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa #endif /* __SPLIT_KERNEL__ */ } +#ifdef __DENOISING_FEATURES__ +ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer, int sample, float value) +{ + kernel_write_pass_float(buffer, sample, value); + + /* The online one-pass variance update that's used for the megakernel can't easily be implemented + * with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. 
*/ +# ifdef __SPLIT_KERNEL__ + kernel_write_pass_float(buffer+1, sample, value*value); +# else + if(sample == 0) { + kernel_write_pass_float(buffer+1, sample, 0.0f); + } + else { + float new_mean = buffer[0] * (1.0f / (sample + 1)); + float old_mean = (buffer[0] - value) * (1.0f / sample); + kernel_write_pass_float(buffer+1, sample, (value - new_mean) * (value - old_mean)); + } +# endif +} + +# if defined(__SPLIT_KERNEL__) +# define kernel_write_pass_float3_unaligned kernel_write_pass_float3 +# else +ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, int sample, float3 value) +{ + buffer[0] = (sample == 0)? value.x: buffer[0] + value.x; + buffer[1] = (sample == 0)? value.y: buffer[1] + value.y; + buffer[2] = (sample == 0)? value.z: buffer[2] + value.z; +} +# endif + +ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, int sample, float3 value) +{ + kernel_write_pass_float3_unaligned(buffer, sample, value); +# ifdef __SPLIT_KERNEL__ + kernel_write_pass_float3_unaligned(buffer+3, sample, value*value); +# else + if(sample == 0) { + kernel_write_pass_float3_unaligned(buffer+3, sample, make_float3(0.0f, 0.0f, 0.0f)); + } + else { + float3 sum = make_float3(buffer[0], buffer[1], buffer[2]); + float3 new_mean = sum * (1.0f / (sample + 1)); + float3 old_mean = (sum - value) * (1.0f / sample); + kernel_write_pass_float3_unaligned(buffer+3, sample, (value - new_mean) * (value - old_mean)); + } +# endif +} + +ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_global float *buffer, + int sample, float path_total, float path_total_shaded) +{ + if(kernel_data.film.pass_denoising_data == 0) + return; + + buffer += (sample & 1)? 
DENOISING_PASS_SHADOW_B : DENOISING_PASS_SHADOW_A; + + path_total = ensure_finite(path_total); + path_total_shaded = ensure_finite(path_total_shaded); + + kernel_write_pass_float(buffer, sample/2, path_total); + kernel_write_pass_float(buffer+1, sample/2, path_total_shaded); + + float value = path_total_shaded / max(path_total, 1e-7f); +# ifdef __SPLIT_KERNEL__ + kernel_write_pass_float(buffer+2, sample/2, value*value); +# else + if(sample < 2) { + kernel_write_pass_float(buffer+2, sample/2, 0.0f); + } + else { + float old_value = (buffer[1] - path_total_shaded) / max(buffer[0] - path_total, 1e-7f); + float new_value = buffer[1] / max(buffer[0], 1e-7f); + kernel_write_pass_float(buffer+2, sample, (value - new_value) * (value - old_value)); + } +# endif +} +#endif /* __DENOISING_FEATURES__ */ + +ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg, + ShaderData *sd, + ccl_addr_space PathState *state, + PathRadiance *L) +{ +#ifdef __DENOISING_FEATURES__ + if(state->denoising_feature_weight == 0.0f) { + return; + } + + L->denoising_depth += ensure_finite(state->denoising_feature_weight * sd->ray_length); + + /* Skip implicitly transparent surfaces. */ + if(sd->flag & SD_HAS_ONLY_VOLUME) { + return; + } + + float3 normal = make_float3(0.0f, 0.0f, 0.0f); + float3 albedo = make_float3(0.0f, 0.0f, 0.0f); + float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f; + + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + + if(!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) + continue; + + /* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. */ + normal += sc->N * sc->sample_weight; + sum_weight += sc->sample_weight; + if(!bsdf_is_specular_like(sc)) { + albedo += sc->weight; + sum_nonspecular_weight += sc->sample_weight; + } + } + + /* Wait for next bounce if 75% or more sample weight belongs to specular-like closures. 
*/ + if((sum_weight == 0.0f) || (sum_nonspecular_weight*4.0f > sum_weight)) { + if(sum_weight != 0.0f) { + normal /= sum_weight; + } + L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal); + L->denoising_albedo += ensure_finite3(state->denoising_feature_weight * albedo); + + state->denoising_feature_weight = 0.0f; + } +#else + (void) kg; + (void) sd; + (void) state; + (void) L; +#endif /* __DENOISING_FEATURES__ */ +} + ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L, ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput) { @@ -199,5 +333,88 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global f #endif } +ccl_device_inline void kernel_write_result(KernelGlobals *kg, ccl_global float *buffer, + int sample, PathRadiance *L, float alpha, bool is_shadow_catcher) +{ + if(L) { + float3 L_sum; +#ifdef __SHADOW_TRICKS__ + if(is_shadow_catcher) { + L_sum = path_radiance_sum_shadowcatcher(kg, L, &alpha); + } + else +#endif /* __SHADOW_TRICKS__ */ + { + L_sum = path_radiance_clamp_and_sum(kg, L); + } + + kernel_write_pass_float4(buffer, sample, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha)); + + kernel_write_light_passes(kg, buffer, L, sample); + +#ifdef __DENOISING_FEATURES__ + if(kernel_data.film.pass_denoising_data) { +# ifdef __SHADOW_TRICKS__ + kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, average(L->path_total), average(L->path_total_shaded)); +# else + kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f); +# endif + if(kernel_data.film.pass_denoising_clean) { + float3 noisy, clean; +#ifdef __SHADOW_TRICKS__ + if(is_shadow_catcher) { + noisy = L_sum; + clean = make_float3(0.0f, 0.0f, 0.0f); + } + else +#endif /* __SHADOW_TRICKS__ */ + { + path_radiance_split_denoising(kg, L, &noisy, &clean); + } + kernel_write_pass_float3_variance(buffer + 
kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, + sample, noisy); + kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean, + sample, clean); + } + else { + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, + sample, ensure_finite3(L_sum)); + } + + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL, + sample, L->denoising_normal); + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO, + sample, L->denoising_albedo); + kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH, + sample, L->denoising_depth); + } +#endif /* __DENOISING_FEATURES__ */ + } + else { + kernel_write_pass_float4(buffer, sample, make_float4(0.0f, 0.0f, 0.0f, 0.0f)); + +#ifdef __DENOISING_FEATURES__ + if(kernel_data.film.pass_denoising_data) { + kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f); + + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, + sample, make_float3(0.0f, 0.0f, 0.0f)); + + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL, + sample, make_float3(0.0f, 0.0f, 0.0f)); + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO, + sample, make_float3(0.0f, 0.0f, 0.0f)); + kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH, + sample, 0.0f); + + if(kernel_data.film.pass_denoising_clean) { + kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean, + sample, make_float3(0.0f, 0.0f, 0.0f)); + } + } +#endif /* __DENOISING_FEATURES__ */ + } +} + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index 
e7957042182..c340b3bc968 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -58,7 +58,7 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, PathRadiance *L, - PathState *state, + ccl_addr_space PathState *state, RNG *rng, float3 throughput, float3 ao_alpha) @@ -90,14 +90,16 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, light_ray.dD = differential3_zero(); if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) { - path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce); + path_radiance_accum_ao(L, state, throughput, ao_alpha, ao_bsdf, ao_shadow); } else { - path_radiance_accum_total_ao(L, throughput, ao_bsdf); + path_radiance_accum_total_ao(L, state, throughput, ao_bsdf); } } } +#ifndef __SPLIT_KERNEL__ + ccl_device void kernel_path_indirect(KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, @@ -364,6 +366,8 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, throughput /= probability; } + kernel_update_denoising_features(kg, sd, state, L); + #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) { @@ -403,7 +407,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, } #endif /* __SUBSURFACE__ */ -#if defined(__EMISSION__) && defined(__BRANCHED_PATH__) +#if defined(__EMISSION__) if(kernel_data.integrator.use_direct_light) { int all = (kernel_data.integrator.sample_all_lights_indirect) || (state->flag & PATH_RAY_SHADOW_CATCHER); @@ -417,7 +421,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, L, all); } -#endif /* defined(__EMISSION__) && defined(__BRANCHED_PATH__) */ +#endif /* defined(__EMISSION__) */ if(!kernel_path_surface_bounce(kg, rng, sd, &throughput, state, L, ray)) break; @@ -425,18 +429,19 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, } -ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, - RNG 
*rng, - int sample, - Ray ray, - ccl_global float *buffer) +ccl_device_inline float kernel_path_integrate(KernelGlobals *kg, + RNG *rng, + int sample, + Ray ray, + ccl_global float *buffer, + PathRadiance *L, + bool *is_shadow_catcher) { /* initialize */ - PathRadiance L; float3 throughput = make_float3(1.0f, 1.0f, 1.0f); float L_transparent = 0.0f; - path_radiance_init(&L, kernel_data.film.use_light_pass); + path_radiance_init(L, kernel_data.film.use_light_pass); /* shader data memory used for both volumes and surfaces, saves stack space */ ShaderData sd; @@ -515,7 +520,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, float3 emission; if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission)) - path_radiance_accum_emission(&L, throughput, emission, state.bounce); + path_radiance_accum_emission(L, throughput, emission, state.bounce); } #endif /* __LAMP_MIS__ */ @@ -547,7 +552,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, /* emission */ if(volume_segment.closure_flag & SD_EMISSION) - path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce); + path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce); /* scattering */ VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED; @@ -557,7 +562,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, /* direct light sampling */ kernel_branched_path_volume_connect_light(kg, rng, &sd, - &emission_sd, throughput, &state, &L, all, + &emission_sd, throughput, &state, L, all, &volume_ray, &volume_segment); /* indirect sample. 
if we use distance sampling and take just @@ -575,7 +580,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, kernel_volume_decoupled_free(kg, &volume_segment); if(result == VOLUME_PATH_SCATTERED) { - if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray)) + if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray)) continue; else break; @@ -589,15 +594,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, { /* integrate along volume segment with distance sampling */ VolumeIntegrateResult result = kernel_volume_integrate( - kg, &state, &sd, &volume_ray, &L, &throughput, rng, heterogeneous); + kg, &state, &sd, &volume_ray, L, &throughput, rng, heterogeneous); # ifdef __VOLUME_SCATTER__ if(result == VOLUME_PATH_SCATTERED) { /* direct lighting */ - kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L); + kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L); /* indirect light bounce */ - if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray)) + if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray)) continue; else break; @@ -621,7 +626,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, #ifdef __BACKGROUND__ /* sample background shader */ float3 L_background = indirect_background(kg, &emission_sd, &state, &ray); - path_radiance_accum_background(&L, &state, throughput, L_background); + path_radiance_accum_background(L, &state, throughput, L_background); #endif /* __BACKGROUND__ */ break; @@ -638,11 +643,16 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, #ifdef __SHADOW_TRICKS__ if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) { if(state.flag & PATH_RAY_CAMERA) { - state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY); + state.flag |= (PATH_RAY_SHADOW_CATCHER | + PATH_RAY_SHADOW_CATCHER_ONLY | + PATH_RAY_STORE_SHADOW_INFO); state.catcher_object = 
sd.object; if(!kernel_data.background.transparent) { - L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray); + L->shadow_background_color = + indirect_background(kg, &emission_sd, &state, &ray); } + L->shadow_radiance_sum = path_radiance_clamp_and_sum(kg, L); + L->shadow_throughput = average(throughput); } } else { @@ -675,7 +685,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, #endif /* __HOLDOUT__ */ /* holdout mask objects do not write data passes */ - kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput); + kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput); /* blurring of bsdf after bounces, for rays that have a small likelihood * of following this particular path (diffuse, rough glossy) */ @@ -693,7 +703,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, if(sd.flag & SD_EMISSION) { /* todo: is isect.t wrong here for transparent surfaces? */ float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf); - path_radiance_accum_emission(&L, throughput, emission, state.bounce); + path_radiance_accum_emission(L, throughput, emission, state.bounce); } #endif /* __EMISSION__ */ @@ -713,10 +723,12 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, throughput /= probability; } + kernel_update_denoising_features(kg, &sd, &state, L); + #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { - kernel_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd)); + kernel_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd)); } #endif /* __AO__ */ @@ -727,7 +739,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, if(kernel_path_subsurface_scatter(kg, &sd, &emission_sd, - &L, + L, &state, rng, &ray, @@ -740,15 +752,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, #endif /* 
__SUBSURFACE__ */ /* direct lighting */ - kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L); + kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L); /* compute direct lighting and next bounce */ - if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, &L, &ray)) + if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, L, &ray)) break; } #ifdef __SUBSURFACE__ - kernel_path_subsurface_accum_indirect(&ss_indirect, &L); + kernel_path_subsurface_accum_indirect(&ss_indirect, L); /* Trace indirect subsurface rays by restarting the loop. this uses less * stack memory than invoking kernel_path_indirect. @@ -758,7 +770,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, &ss_indirect, &state, &ray, - &L, + L, &throughput); } else { @@ -767,24 +779,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, } #endif /* __SUBSURFACE__ */ - float3 L_sum; #ifdef __SHADOW_TRICKS__ - if(state.flag & PATH_RAY_SHADOW_CATCHER) { - L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent); - } - else + *is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER); #endif /* __SHADOW_TRICKS__ */ - { - L_sum = path_radiance_clamp_and_sum(kg, &L); - } - - kernel_write_light_passes(kg, buffer, &L, sample); #ifdef __KERNEL_DEBUG__ kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample); #endif /* __KERNEL_DEBUG__ */ - return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent); + return 1.0f - L_transparent; } ccl_device void kernel_path_trace(KernelGlobals *kg, @@ -805,18 +808,21 @@ ccl_device void kernel_path_trace(KernelGlobals *kg, kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray); /* integrate */ - float4 L; - - if(ray.t != 0.0f) - L = kernel_path_integrate(kg, &rng, sample, ray, buffer); - else - L = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + PathRadiance L; + bool is_shadow_catcher; - /* accumulate result in output buffer */ - 
kernel_write_pass_float4(buffer, sample, L); + if(ray.t != 0.0f) { + float alpha = kernel_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher); + kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher); + } + else { + kernel_write_result(kg, buffer, sample, NULL, 0.0f, false); + } path_rng_end(kg, rng_state, rng); } +#endif /* __SPLIT_KERNEL__ */ + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h index 36fd6c95fe7..77d4f1df447 100644 --- a/intern/cycles/kernel/kernel_path_branched.h +++ b/intern/cycles/kernel/kernel_path_branched.h @@ -22,7 +22,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, PathRadiance *L, - PathState *state, + ccl_addr_space PathState *state, RNG *rng, float3 throughput) { @@ -56,29 +56,48 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg, light_ray.dD = differential3_zero(); if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) { - path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce); + path_radiance_accum_ao(L, state, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow); } else { - path_radiance_accum_total_ao(L, throughput*num_samples_inv, ao_bsdf); + path_radiance_accum_total_ao(L, state, throughput*num_samples_inv, ao_bsdf); } } } } +#ifndef __SPLIT_KERNEL__ /* bounce off surface and integrate indirect light */ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg, RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd, float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L) { + float sum_sample_weight = 0.0f; +#ifdef __DENOISING_FEATURES__ + if(state->denoising_feature_weight > 0.0f) { + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + /* transparency is not handled here, but 
in outer loop */ + if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { + continue; + } + + sum_sample_weight += sc->sample_weight; + } + } + else { + sum_sample_weight = 1.0f; + } +#endif /* __DENOISING_FEATURES__ */ + for(int i = 0; i < sd->num_closure; i++) { const ShaderClosure *sc = &sd->closure[i]; - if(!CLOSURE_IS_BSDF(sc->type)) - continue; /* transparency is not handled here, but in outer loop */ - if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) + if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { continue; + } int num_samples; @@ -110,7 +129,8 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba &tp, &ps, L, - &bsdf_ray)) + &bsdf_ray, + sum_sample_weight)) { continue; } @@ -242,14 +262,19 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, } #endif /* __SUBSURFACE__ */ -ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer) +ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, + RNG *rng, + int sample, + Ray ray, + ccl_global float *buffer, + PathRadiance *L, + bool *is_shadow_catcher) { /* initialize */ - PathRadiance L; float3 throughput = make_float3(1.0f, 1.0f, 1.0f); float L_transparent = 0.0f; - path_radiance_init(&L, kernel_data.film.use_light_pass); + path_radiance_init(L, kernel_data.film.use_light_pass); /* shader data memory used for both volumes and surfaces, saves stack space */ ShaderData sd; @@ -329,7 +354,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in int all = kernel_data.integrator.sample_all_lights_direct; kernel_branched_path_volume_connect_light(kg, rng, &sd, - &emission_sd, throughput, &state, &L, all, + &emission_sd, throughput, &state, L, all, &volume_ray, &volume_segment); /* indirect light sampling */ @@ -337,11 +362,6 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in float 
num_samples_inv = 1.0f/num_samples; for(int j = 0; j < num_samples; j++) { - /* workaround to fix correlation bug in T38710, can find better solution - * in random number generator later, for now this is done here to not impact - * performance of rendering without volumes */ - RNG tmp_rng = cmj_hash(*rng, state.rng_offset); - PathState ps = state; Ray pray = ray; float3 tp = throughput; @@ -352,8 +372,8 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in /* scatter sample. if we use distance sampling and take just one * sample for direct and indirect light, we could share this * computation, but makes code a bit complex */ - float rphase = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_PHASE); - float rscatter = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_SCATTER_DISTANCE); + float rphase = path_state_rng_1D_for_decision(kg, rng, &ps, PRNG_PHASE); + float rscatter = path_state_rng_1D_for_decision(kg, rng, &ps, PRNG_SCATTER_DISTANCE); VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, &ps, &pray, &sd, &tp, rphase, rscatter, &volume_segment, NULL, false); @@ -366,7 +386,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in &sd, &tp, &ps, - &L, + L, &pray)) { kernel_path_indirect(kg, @@ -377,19 +397,19 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in tp*num_samples_inv, num_samples, &ps, - &L); + L); /* for render passes, sum and reset indirect light pass variables * for the next samples */ - path_radiance_sum_indirect(&L); - path_radiance_reset_indirect(&L); + path_radiance_sum_indirect(L); + path_radiance_reset_indirect(L); } } } /* emission and transmittance */ if(volume_segment.closure_flag & SD_EMISSION) - path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce); + path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce); throughput *= 
volume_segment.accum_transmittance; /* free cached steps */ @@ -411,20 +431,20 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in path_state_branch(&ps, j, num_samples); VolumeIntegrateResult result = kernel_volume_integrate( - kg, &ps, &sd, &volume_ray, &L, &tp, rng, heterogeneous); + kg, &ps, &sd, &volume_ray, L, &tp, rng, heterogeneous); #ifdef __VOLUME_SCATTER__ if(result == VOLUME_PATH_SCATTERED) { /* todo: support equiangular, MIS and all light sampling. * alternatively get decoupled ray marching working on the GPU */ - kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, &L); + kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, L); if(kernel_path_volume_bounce(kg, rng, &sd, &tp, &ps, - &L, + L, &pray)) { kernel_path_indirect(kg, @@ -435,12 +455,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in tp, num_samples, &ps, - &L); + L); /* for render passes, sum and reset indirect light pass variables * for the next samples */ - path_radiance_sum_indirect(&L); - path_radiance_reset_indirect(&L); + path_radiance_sum_indirect(L); + path_radiance_reset_indirect(L); } } #endif /* __VOLUME_SCATTER__ */ @@ -466,7 +486,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #ifdef __BACKGROUND__ /* sample background shader */ float3 L_background = indirect_background(kg, &emission_sd, &state, &ray); - path_radiance_accum_background(&L, &state, throughput, L_background); + path_radiance_accum_background(L, &state, throughput, L_background); #endif /* __BACKGROUND__ */ break; @@ -479,13 +499,16 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #ifdef __SHADOW_TRICKS__ if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) { - if(state.flag & PATH_RAY_CAMERA) { - state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY); - state.catcher_object = sd.object; - if(!kernel_data.background.transparent) 
{ - L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray); - } + state.flag |= (PATH_RAY_SHADOW_CATCHER | + PATH_RAY_SHADOW_CATCHER_ONLY | + PATH_RAY_STORE_SHADOW_INFO); + state.catcher_object = sd.object; + if(!kernel_data.background.transparent) { + L->shadow_background_color = + indirect_background(kg, &emission_sd, &state, &ray); } + L->shadow_radiance_sum = path_radiance_clamp_and_sum(kg, L); + L->shadow_throughput = average(throughput); } else { state.flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY; @@ -513,13 +536,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #endif /* __HOLDOUT__ */ /* holdout mask objects do not write data passes */ - kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput); + kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput); #ifdef __EMISSION__ /* emission */ if(sd.flag & SD_EMISSION) { float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf); - path_radiance_accum_emission(&L, throughput, emission, state.bounce); + path_radiance_accum_emission(L, throughput, emission, state.bounce); } #endif /* __EMISSION__ */ @@ -543,10 +566,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in } } + kernel_update_denoising_features(kg, &sd, &state, L); + #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { - kernel_branched_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput); + kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput); } #endif /* __AO__ */ @@ -554,7 +579,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in /* bssrdf scatter to a different location on the same object */ if(sd.flag & SD_BSSRDF) { kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd, - &L, &state, rng, &ray, throughput); + L, &state, rng, &ray, throughput); } #endif /* __SUBSURFACE__ 
*/ @@ -567,13 +592,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in int all = (kernel_data.integrator.sample_all_lights_direct) || (state.flag & PATH_RAY_SHADOW_CATCHER); kernel_branched_path_surface_connect_light(kg, rng, - &sd, &emission_sd, &hit_state, throughput, 1.0f, &L, all); + &sd, &emission_sd, &hit_state, throughput, 1.0f, L, all); } #endif /* __EMISSION__ */ /* indirect light */ kernel_branched_path_surface_indirect_light(kg, rng, - &sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, &L); + &sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, L); /* continue in case of transparency */ throughput *= shader_bsdf_transparency(kg, &sd); @@ -602,24 +627,15 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #endif /* __VOLUME__ */ } - float3 L_sum; #ifdef __SHADOW_TRICKS__ - if(state.flag & PATH_RAY_SHADOW_CATCHER) { - L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent); - } - else + *is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER); #endif /* __SHADOW_TRICKS__ */ - { - L_sum = path_radiance_clamp_and_sum(kg, &L); - } - - kernel_write_light_passes(kg, buffer, &L, sample); #ifdef __KERNEL_DEBUG__ kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample); #endif /* __KERNEL_DEBUG__ */ - return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent); + return 1.0f - L_transparent; } ccl_device void kernel_branched_path_trace(KernelGlobals *kg, @@ -640,20 +656,22 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg, kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray); /* integrate */ - float4 L; - - if(ray.t != 0.0f) - L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer); - else - L = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + PathRadiance L; + bool is_shadow_catcher; - /* accumulate result in output buffer */ - kernel_write_pass_float4(buffer, sample, L); + if(ray.t != 0.0f) { + float alpha = 
kernel_branched_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher); + kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher); + } + else { + kernel_write_result(kg, buffer, sample, NULL, 0.0f, false); + } path_rng_end(kg, rng_state, rng); } +#endif /* __SPLIT_KERNEL__ */ + #endif /* __BRANCHED_PATH__ */ CCL_NAMESPACE_END - diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h index c0cd2a63120..5d92fd12201 100644 --- a/intern/cycles/kernel/kernel_path_state.h +++ b/intern/cycles/kernel/kernel_path_state.h @@ -35,6 +35,16 @@ ccl_device_inline void path_state_init(KernelGlobals *kg, state->transmission_bounce = 0; state->transparent_bounce = 0; +#ifdef __DENOISING_FEATURES__ + if(kernel_data.film.pass_denoising_data) { + state->flag |= PATH_RAY_STORE_SHADOW_INFO; + state->denoising_feature_weight = 1.0f; + } + else { + state->denoising_feature_weight = 0.0f; + } +#endif /* __DENOISING_FEATURES__ */ + state->min_ray_pdf = FLT_MAX; state->ray_pdf = 0.0f; #ifdef __LAMP_MIS__ @@ -128,6 +138,12 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta /* random number generator next bounce */ state->rng_offset += PRNG_BOUNCE_NUM; + +#ifdef __DENOISING_FEATURES__ + if((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) { + state->flag &= ~PATH_RAY_STORE_SHADOW_INFO; + } +#endif } ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *state) diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h index 076c82f3853..dcb577e176f 100644 --- a/intern/cycles/kernel/kernel_path_surface.h +++ b/intern/cycles/kernel/kernel_path_surface.h @@ -16,7 +16,7 @@ CCL_NAMESPACE_BEGIN -#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) || defined(__SHADOW_TRICKS__) +#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) || defined(__SHADOW_TRICKS__) || defined(__BAKING__) /* 
branched path tracing: connect path directly to position on one or more lights and add it to L */ ccl_device_noinline void kernel_branched_path_surface_connect_light( KernelGlobals *kg, @@ -70,10 +70,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } else { - path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light); + path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light); } } } @@ -107,10 +107,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } else { - path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light); + path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light); } } } @@ -133,10 +133,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, is_lamp); } else { - path_radiance_accum_total_light(L, throughput*num_samples_adjust, &L_light); + path_radiance_accum_total_light(L, state, throughput*num_samples_adjust, &L_light); } } } @@ -155,7 +155,8 @@ ccl_device bool 
kernel_branched_path_surface_bounce( ccl_addr_space float3 *throughput, ccl_addr_space PathState *state, PathRadiance *L, - Ray *ray) + ccl_addr_space Ray *ray, + float sum_sample_weight) { /* sample BSDF */ float bsdf_pdf; @@ -175,6 +176,10 @@ ccl_device bool kernel_branched_path_surface_bounce( /* modify throughput */ path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label); +#ifdef __DENOISING_FEATURES__ + state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples); +#endif + /* modify path state */ path_state_next(kg, state, label); @@ -257,10 +262,10 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp); } else { - path_radiance_accum_total_light(L, throughput, &L_light); + path_radiance_accum_total_light(L, state, throughput, &L_light); } } } diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h index 371f2c1c7cb..dcedf51e479 100644 --- a/intern/cycles/kernel/kernel_path_volume.h +++ b/intern/cycles/kernel/kernel_path_volume.h @@ -55,7 +55,7 @@ ccl_device_inline void kernel_path_volume_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp); } } } @@ -184,7 +184,7 @@ ccl_device void kernel_branched_path_volume_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, 
tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } } } @@ -233,7 +233,7 @@ ccl_device void kernel_branched_path_volume_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } } } @@ -271,7 +271,7 @@ ccl_device void kernel_branched_path_volume_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, tp, &L_light, shadow, 1.0f, state->bounce, is_lamp); + path_radiance_accum_light(L, state, tp, &L_light, shadow, 1.0f, is_lamp); } } } diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h index 9a2b0884a7e..cbb2442d1dc 100644 --- a/intern/cycles/kernel/kernel_projection.h +++ b/intern/cycles/kernel/kernel_projection.h @@ -57,6 +57,9 @@ ccl_device float3 spherical_to_direction(float theta, float phi) ccl_device float2 direction_to_equirectangular_range(float3 dir, float4 range) { + if(is_zero(dir)) + return make_float2(0.0f, 0.0f); + float u = (atan2f(dir.y, dir.x) - range.y) / range.x; float v = (acosf(dir.z / len(dir)) - range.w) / range.z; diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h index 96bc636d5ac..e32d4bbbc1b 100644 --- a/intern/cycles/kernel/kernel_queues.h +++ b/intern/cycles/kernel/kernel_queues.h @@ -128,6 +128,21 @@ ccl_device unsigned int get_global_queue_index( return my_gqidx; } +ccl_device int dequeue_ray_index( + int queue_number, + ccl_global int *queues, + int queue_size, + ccl_global int *queue_index) +{ + int index = atomic_fetch_and_dec_uint32((ccl_global uint*)&queue_index[queue_number])-1; + + if(index < 0) { + return QUEUE_EMPTY_SLOT; + } + + return queues[index + queue_number * queue_size]; +} + CCL_NAMESPACE_END 
#endif // __KERNEL_QUEUE_H__ diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h index d4f0caff5de..e8a912ccc0b 100644 --- a/intern/cycles/kernel/kernel_random.h +++ b/intern/cycles/kernel/kernel_random.h @@ -20,14 +20,15 @@ CCL_NAMESPACE_BEGIN #ifdef __SOBOL__ -/* skip initial numbers that are not as well distributed, especially the +/* Skip initial numbers that are not as well distributed, especially the * first sequence is just 0 everywhere, which can be problematic for e.g. - * path termination */ + * path termination. + */ #define SOBOL_SKIP 64 -/* High Dimensional Sobol */ +/* High Dimensional Sobol. */ -/* van der corput radical inverse */ +/* Van der Corput radical inverse. */ ccl_device uint van_der_corput(uint bits) { bits = (bits << 16) | (bits >> 16); @@ -38,58 +39,63 @@ ccl_device uint van_der_corput(uint bits) return bits; } -/* sobol radical inverse */ +/* Sobol radical inverse. */ ccl_device uint sobol(uint i) { uint r = 0; - - for(uint v = 1U << 31; i; i >>= 1, v ^= v >> 1) - if(i & 1) + for(uint v = 1U << 31; i; i >>= 1, v ^= v >> 1) { + if(i & 1) { r ^= v; - + } + } return r; } -/* inverse of sobol radical inverse */ +/* Inverse of sobol radical inverse. */ ccl_device uint sobol_inverse(uint i) { const uint msb = 1U << 31; uint r = 0; - - for(uint v = 1; i; i <<= 1, v ^= v << 1) - if(i & msb) + for(uint v = 1; i; i <<= 1, v ^= v << 1) { + if(i & msb) { r ^= v; - + } + } return r; } -/* multidimensional sobol with generator matrices - * dimension 0 and 1 are equal to van_der_corput() and sobol() respectively */ +/* Multidimensional sobol with generator matrices + * dimension 0 and 1 are equal to van_der_corput() and sobol() respectively. 
+ */ ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension) { uint result = 0; uint i = index; - - for(uint j = 0; i; i >>= 1, j++) - if(i & 1) + for(uint j = 0; i; i >>= 1, j++) { + if(i & 1) { result ^= kernel_tex_fetch(__sobol_directions, 32*dimension + j); - + } + } return result; } -/* lookup index and x/y coordinate, assumes m is a power of two */ -ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, const uint ey, uint *x, uint *y) +/* Lookup index and x/y coordinate, assumes m is a power of two. */ +ccl_device uint sobol_lookup(const uint m, + const uint frame, + const uint ex, + const uint ey, + uint *x, uint *y) { - /* shift is constant per frame */ + /* Shift is constant per frame. */ const uint shift = frame << (m << 1); const uint sobol_shift = sobol(shift); - /* van der Corput is its own inverse */ + /* Van der Corput is its own inverse. */ const uint lower = van_der_corput(ex << (32 - m)); - /* need to compensate for ey difference and shift */ + /* Need to compensate for ey difference and shift. */ const uint sobol_lower = sobol(lower); - const uint mask = ~-(1 << m) << (32 - m); /* only m upper bits */ + const uint mask = ~-(1 << m) << (32 - m); /* Only m upper bits. */ const uint delta = ((ey << (32 - m)) ^ sobol_lower ^ sobol_shift) & mask; - /* only use m upper bits for the index (m is a power of two) */ + /* Only use m upper bits for the index (m is a power of two). 
*/ const uint sobol_result = delta | (delta >> m); const uint upper = sobol_inverse(sobol_result); const uint index = shift | upper | lower; @@ -98,11 +104,14 @@ ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, cons return index; } -ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension) +ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, + RNG *rng, + int sample, int num_samples, + int dimension) { #ifdef __CMJ__ if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) { - /* correlated multi-jittered */ + /* Correlated multi-jitter. */ int p = *rng + dimension; return cmj_sample_1D(sample, num_samples, p); } @@ -113,7 +122,7 @@ ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample float r = (float)result * (1.0f/(float)0xFFFFFFFF); return r; #else - /* compute sobol sequence value using direction vectors */ + /* Compute sobol sequence value using direction vectors. */ uint result = sobol_dimension(kg, sample + SOBOL_SKIP, dimension); float r = (float)result * (1.0f/(float)0xFFFFFFFF); @@ -130,24 +139,33 @@ ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample #endif } -ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy) +ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, + RNG *rng, + int sample, int num_samples, + int dimension, + float *fx, float *fy) { #ifdef __CMJ__ if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) { - /* correlated multi-jittered */ + /* Correlated multi-jitter. */ int p = *rng + dimension; cmj_sample_2D(sample, num_samples, p, fx, fy); } else #endif { - /* sobol */ + /* Sobol. 
*/ *fx = path_rng_1D(kg, rng, sample, num_samples, dimension); *fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1); } } -ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy) +ccl_device_inline void path_rng_init(KernelGlobals *kg, + ccl_global uint *rng_state, + int sample, int num_samples, + RNG *rng, + int x, int y, + float *fx, float *fy) { #ifdef __SOBOL_FULL_SCREEN__ uint px, py; @@ -182,29 +200,43 @@ ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_sta #endif } -ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG rng) +ccl_device void path_rng_end(KernelGlobals *kg, + ccl_global uint *rng_state, + RNG rng) { /* nothing to do */ } -#else +#else /* __SOBOL__ */ /* Linear Congruential Generator */ -ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension) +ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, + RNG *rng, + int sample, int num_samples, + int dimension) { /* implicit mod 2^32 */ *rng = (1103515245*(*rng) + 12345); return (float)*rng * (1.0f/(float)0xFFFFFFFF); } -ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy) +ccl_device_inline void path_rng_2D(KernelGlobals *kg, + RNG *rng, + int sample, int num_samples, + int dimension, + float *fx, float *fy) { *fx = path_rng_1D(kg, rng, sample, num_samples, dimension); *fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1); } -ccl_device void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy) +ccl_device void path_rng_init(KernelGlobals *kg, + ccl_global uint *rng_state, + int sample, int num_samples, + RNG *rng, + int x, int y, + float *fx, float *fy) { /* load state */ *rng = *rng_state; @@ -220,13 
+252,15 @@ ccl_device void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int } } -ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG rng) +ccl_device void path_rng_end(KernelGlobals *kg, + ccl_global uint *rng_state, + RNG rng) { /* store state for next sample */ *rng_state = rng; } -#endif +#endif /* __SOBOL__ */ /* Linear Congruential Generator */ @@ -257,49 +291,108 @@ ccl_device uint lcg_init(uint seed) * dimension to avoid using the same sequence twice. * * For branches in the path we must be careful not to reuse the same number - * in a sequence and offset accordingly. */ + * in a sequence and offset accordingly. + */ -ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int dimension) +ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, + RNG *rng, + const ccl_addr_space PathState *state, + int dimension) { - return path_rng_1D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension); + return path_rng_1D(kg, + rng, + state->sample, state->num_samples, + state->rng_offset + dimension); } -ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int dimension) +ccl_device_inline float path_state_rng_1D_for_decision( + KernelGlobals *kg, + RNG *rng, + const ccl_addr_space PathState *state, + int dimension) { - /* the rng_offset is not increased for transparent bounces. if we do then + /* The rng_offset is not increased for transparent bounces. if we do then * fully transparent objects can become subtly visible by the different * sampling patterns used where the transparent object is. 
* * however for some random numbers that will determine if we next bounce * is transparent we do need to increase the offset to avoid always making - * the same decision */ - int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM; - return path_rng_1D(kg, rng, state->sample, state->num_samples, rng_offset + dimension); + * the same decision. */ + const int rng_offset = state->rng_offset + state->transparent_bounce * PRNG_BOUNCE_NUM; + return path_rng_1D(kg, + rng, + state->sample, state->num_samples, + rng_offset + dimension); } -ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy) +ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, + RNG *rng, + const ccl_addr_space PathState *state, + int dimension, + float *fx, float *fy) { - path_rng_2D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy); + path_rng_2D(kg, + rng, + state->sample, state->num_samples, + state->rng_offset + dimension, + fx, fy); } -ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches, int dimension) +ccl_device_inline float path_branched_rng_1D( + KernelGlobals *kg, + RNG *rng, + const ccl_addr_space PathState *state, + int branch, + int num_branches, + int dimension) { - return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension); + return path_rng_1D(kg, + rng, + state->sample * num_branches + branch, + state->num_samples * num_branches, + state->rng_offset + dimension); } -ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches, int dimension) +ccl_device_inline float path_branched_rng_1D_for_decision( + KernelGlobals *kg, + RNG *rng, + const ccl_addr_space PathState *state, + int branch, 
+ int num_branches, + int dimension) { - int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM; - return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, rng_offset + dimension); + const int rng_offset = state->rng_offset + state->transparent_bounce * PRNG_BOUNCE_NUM; + return path_rng_1D(kg, + rng, + state->sample * num_branches + branch, + state->num_samples * num_branches, + rng_offset + dimension); } -ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy) +ccl_device_inline void path_branched_rng_2D( + KernelGlobals *kg, + RNG *rng, + const ccl_addr_space PathState *state, + int branch, + int num_branches, + int dimension, + float *fx, float *fy) { - path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy); + path_rng_2D(kg, + rng, + state->sample * num_branches + branch, + state->num_samples * num_branches, + state->rng_offset + dimension, + fx, fy); } -/* Utitility functions to get light termination value, since it might not be needed in many cases. */ -ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state) +/* Utitility functions to get light termination value, + * since it might not be needed in many cases. 
+ */ +ccl_device_inline float path_state_rng_light_termination( + KernelGlobals *kg, + RNG *rng, + const ccl_addr_space PathState *state) { if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) { return path_state_rng_1D_for_decision(kg, rng, state, PRNG_LIGHT_TERMINATE); @@ -307,15 +400,27 @@ ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg, RNG return 0.0f; } -ccl_device_inline float path_branched_rng_light_termination(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches) +ccl_device_inline float path_branched_rng_light_termination( + KernelGlobals *kg, + RNG *rng, + const ccl_addr_space PathState *state, + int branch, + int num_branches) { if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) { - return path_branched_rng_1D_for_decision(kg, rng, state, branch, num_branches, PRNG_LIGHT_TERMINATE); + return path_branched_rng_1D_for_decision(kg, + rng, + state, + branch, + num_branches, + PRNG_LIGHT_TERMINATE); } return 0.0f; } -ccl_device_inline void path_state_branch(ccl_addr_space PathState *state, int branch, int num_branches) +ccl_device_inline void path_state_branch(ccl_addr_space PathState *state, + int branch, + int num_branches) { /* path is splitting into a branch, adjust so that each branch * still gets a unique sample from the same sequence */ @@ -324,14 +429,17 @@ ccl_device_inline void path_state_branch(ccl_addr_space PathState *state, int br state->num_samples = state->num_samples*num_branches; } -ccl_device_inline uint lcg_state_init(RNG *rng, int rng_offset, int sample, uint scramble) +ccl_device_inline uint lcg_state_init(RNG *rng, + int rng_offset, + int sample, + uint scramble) { return lcg_init(*rng + rng_offset + sample*scramble); } ccl_device float lcg_step_float_addrspace(ccl_addr_space uint *rng) { - /* implicit mod 2^32 */ + /* Implicit mod 2^32 */ *rng = (1103515245*(*rng) + 12345); return (float)*rng * (1.0f/(float)0xFFFFFFFF); } diff --git 
a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h index 8c0c5e90a3e..c66f52255f0 100644 --- a/intern/cycles/kernel/kernel_shader.h +++ b/intern/cycles/kernel/kernel_shader.h @@ -99,7 +99,7 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg, /* smooth normal */ if(sd->shader & SHADER_SMOOTH_NORMAL) - sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v); + sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); #ifdef __DPDU__ /* dPdu/dPdv */ @@ -186,7 +186,7 @@ void shader_setup_from_subsurface( sd->N = Ng; if(sd->shader & SHADER_SMOOTH_NORMAL) - sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v); + sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); # ifdef __DPDU__ /* dPdu/dPdv */ @@ -300,7 +300,7 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg, if(sd->type & PRIMITIVE_TRIANGLE) { /* smooth normal */ if(sd->shader & SHADER_SMOOTH_NORMAL) { - sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v); + sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); #ifdef __INSTANCING__ if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h index db6f839d9ed..fab5946970d 100644 --- a/intern/cycles/kernel/kernel_shadow.h +++ b/intern/cycles/kernel/kernel_shadow.h @@ -379,7 +379,7 @@ ccl_device bool shadow_blocked_transparent_stepped( float3 *shadow) { bool blocked, is_transparent_isect; - if (skip_object == OBJECT_NONE) { + if(skip_object == OBJECT_NONE) { blocked = scene_intersect(kg, *ray, PATH_RAY_SHADOW_OPAQUE, diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h index f75e9337bdb..6475d4b66fd 100644 --- a/intern/cycles/kernel/kernel_subsurface.h +++ b/intern/cycles/kernel/kernel_subsurface.h @@ -140,7 +140,7 @@ ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd, } /* replace closures with a single diffuse bsdf closure 
after scatter step */ -ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, float3 weight, bool hit, float3 N) +ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, ShaderClosure *sc, float3 weight, bool hit, float3 N) { sd->flag &= ~SD_CLOSURE_FLAGS; sd->randb_closure = 0.0f; @@ -148,15 +148,35 @@ ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, float3 wei sd->num_closure_extra = 0; if(hit) { - DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight); - - if(bsdf) { - bsdf->N = N; - sd->flag |= bsdf_diffuse_setup(bsdf); - - /* replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes - * can recognize it as not being a regular diffuse closure */ - bsdf->type = CLOSURE_BSDF_BSSRDF_ID; + Bssrdf *bssrdf = (Bssrdf *)sc; +#ifdef __PRINCIPLED__ + if(bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID) { + PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), weight); + + if(bsdf) { + bsdf->N = N; + bsdf->roughness = bssrdf->roughness; + sd->flag |= bsdf_principled_diffuse_setup(bsdf); + + /* replace CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID with this special ID so render passes + * can recognize it as not being a regular Disney principled diffuse closure */ + bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID; + } + } + else if(CLOSURE_IS_BSDF_BSSRDF(bssrdf->type) || + CLOSURE_IS_BSSRDF(bssrdf->type)) +#endif /* __PRINCIPLED__ */ + { + DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight); + + if(bsdf) { + bsdf->N = N; + sd->flag |= bsdf_diffuse_setup(bsdf); + + /* replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes + * can recognize it as not being a regular diffuse closure */ + bsdf->type = CLOSURE_BSDF_BSSRDF_ID; + } } } } @@ -379,6 +399,12 @@ ccl_device_noinline void subsurface_scatter_multi_setup( #else Ray *ray = &ss_isect->ray; #endif + + /* Workaround for AMD GPU OpenCL compiler. 
Most probably cache bypass issue. */ +#if defined(__SPLIT_KERNEL__) && defined(__KERNEL_OPENCL_AMD__) && defined(__KERNEL_GPU__) + kernel_split_params.dummy_sd_flag = sd->flag; +#endif + /* Setup new shading point. */ shader_setup_from_subsurface(kg, sd, &ss_isect->hits[hit], ray); @@ -388,12 +414,11 @@ ccl_device_noinline void subsurface_scatter_multi_setup( subsurface_color_bump_blur(kg, sd, state, state_flag, &weight, &N); /* Setup diffuse BSDF. */ - subsurface_scatter_setup_diffuse_bsdf(sd, weight, true, N); + subsurface_scatter_setup_diffuse_bsdf(sd, sc, weight, true, N); } -#ifndef __SPLIT_KERNEL__ /* subsurface scattering step, from a point on the surface to another nearby point on the same object */ -ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathState *state, +ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); @@ -454,6 +479,10 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS if(ss_isect.num_hits > 0) { float3 origP = sd->P; + /* Workaround for AMD GPU OpenCL compiler. Most probably cache bypass issue. */ +#if defined(__SPLIT_KERNEL__) && defined(__KERNEL_OPENCL_AMD__) && defined(__KERNEL_GPU__) + kernel_split_params.dummy_sd_flag = sd->flag; +#endif /* setup new shading point */ shader_setup_from_subsurface(kg, sd, &ss_isect.hits[0], &ray); @@ -479,9 +508,8 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS subsurface_color_bump_blur(kg, sd, state, state_flag, &eval, &N); /* setup diffuse bsdf */ - subsurface_scatter_setup_diffuse_bsdf(sd, eval, (ss_isect.num_hits > 0), N); + subsurface_scatter_setup_diffuse_bsdf(sd, sc, eval, (ss_isect.num_hits > 0), N); } -#endif /* ! 
__SPLIT_KERNEL__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h index cb1a3f40dee..aa5b32803a5 100644 --- a/intern/cycles/kernel/kernel_textures.h +++ b/intern/cycles/kernel/kernel_textures.h @@ -82,10 +82,10 @@ KERNEL_TEX(uint, texture_uint, __sobol_directions) # if __CUDA_ARCH__ < 300 /* full-float image */ KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_000) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_001) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_002) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_003) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_004) +KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_008) +KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_016) +KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_024) +KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_032) KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_000) KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_001) @@ -93,91 +93,93 @@ KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_002) KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_003) KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_004) -/* image */ -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_005) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_006) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_007) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_008) +/* image + * These texture names are encoded to their flattened slots as + * ImageManager::type_index_to_flattened_slot() returns them. 
*/ +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_001) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_009) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_010) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_011) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_012) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_013) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_014) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_015) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_016) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_017) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_018) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_019) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_020) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_021) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_022) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_023) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_024) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_025) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_026) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_027) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_028) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_029) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_030) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_031) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_032) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_033) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_034) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_035) 
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_036) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_037) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_038) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_039) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_040) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_041) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_042) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_043) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_044) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_045) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_046) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_047) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_048) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_049) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_050) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_051) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_052) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_053) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_054) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_055) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_056) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_057) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_058) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_059) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_060) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_061) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_062) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_063) 
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_064) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_065) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_066) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_067) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_068) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_069) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_070) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_071) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_072) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_073) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_074) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_075) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_076) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_077) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_078) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_079) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_080) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_081) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_082) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_083) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_084) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_085) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_086) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_087) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_088) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_089) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_097) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_105) 
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_113) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_121) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_129) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_137) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_145) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_153) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_161) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_169) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_177) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_185) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_193) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_201) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_209) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_217) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_225) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_233) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_241) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_249) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_257) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_265) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_273) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_281) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_289) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_297) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_305) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_313) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_321) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_329) 
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_337) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_345) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_353) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_361) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_369) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_377) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_385) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_393) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_401) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_409) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_417) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_425) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_433) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_441) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_449) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_457) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_465) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_473) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_481) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_489) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_497) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_505) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_513) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_521) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_529) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_537) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_545) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_553) 
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_561) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_569) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_577) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_585) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_593) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_601) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_609) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_617) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_625) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_633) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_641) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_649) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_657) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_665) # else /* bindless textures */ diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 623f3728c69..e6a62c42a38 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -64,6 +64,18 @@ CCL_NAMESPACE_BEGIN # define WORK_POOL_SIZE WORK_POOL_SIZE_CPU #endif + +#define SHADER_SORT_BLOCK_SIZE 2048 + +#ifdef __KERNEL_OPENCL__ +# define SHADER_SORT_LOCAL_SIZE 64 +#elif defined(__KERNEL_CUDA__) +# define SHADER_SORT_LOCAL_SIZE 32 +#else +# define SHADER_SORT_LOCAL_SIZE 1 +#endif + + /* device capabilities */ #ifdef __KERNEL_CPU__ # ifdef __KERNEL_SSE2__ @@ -71,21 +83,18 @@ CCL_NAMESPACE_BEGIN # endif # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ -# ifndef __SPLIT_KERNEL__ -# define __BRANCHED_PATH__ -# endif +# define __BRANCHED_PATH__ # ifdef WITH_OSL # define __OSL__ # endif +# define __PRINCIPLED__ # define __SUBSURFACE__ # define __CMJ__ # define __VOLUME__ # define __VOLUME_SCATTER__ # define 
__SHADOW_RECORD_ALL__ -# ifndef __SPLIT_KERNEL__ -# define __VOLUME_DECOUPLED__ -# define __VOLUME_RECORD_ALL__ -# endif +# define __VOLUME_DECOUPLED__ +# define __VOLUME_RECORD_ALL__ #endif /* __KERNEL_CPU__ */ #ifdef __KERNEL_CUDA__ @@ -94,10 +103,11 @@ CCL_NAMESPACE_BEGIN # define __VOLUME__ # define __VOLUME_SCATTER__ # define __SUBSURFACE__ +# define __PRINCIPLED__ # define __SHADOW_RECORD_ALL__ +# define __CMJ__ # ifndef __SPLIT_KERNEL__ # define __BRANCHED_PATH__ -# define __CMJ__ # endif #endif /* __KERNEL_CUDA__ */ @@ -109,43 +119,44 @@ CCL_NAMESPACE_BEGIN # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ # define __SUBSURFACE__ +# define __PRINCIPLED__ # define __VOLUME__ # define __VOLUME_SCATTER__ # define __SHADOW_RECORD_ALL__ -# ifdef __KERNEL_EXPERIMENTAL__ -# define __CMJ__ -# endif +# define __CMJ__ +# define __BRANCHED_PATH__ # endif /* __KERNEL_OPENCL_NVIDIA__ */ # ifdef __KERNEL_OPENCL_APPLE__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ +# define __PRINCIPLED__ +# define __CMJ__ /* TODO(sergey): Currently experimental section is ignored here, * this is because megakernel in device_opencl does not support * custom cflags depending on the scene features. 
*/ -# ifdef __KERNEL_EXPERIMENTAL__ -# define __CMJ__ -# endif -# endif /* __KERNEL_OPENCL_NVIDIA__ */ +# endif /* __KERNEL_OPENCL_APPLE__ */ # ifdef __KERNEL_OPENCL_AMD__ # define __CL_USE_NATIVE__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ # define __SUBSURFACE__ +# define __PRINCIPLED__ # define __VOLUME__ # define __VOLUME_SCATTER__ # define __SHADOW_RECORD_ALL__ +# define __CMJ__ +# define __BRANCHED_PATH__ # endif /* __KERNEL_OPENCL_AMD__ */ # ifdef __KERNEL_OPENCL_INTEL_CPU__ # define __CL_USE_NATIVE__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ -# ifdef __KERNEL_EXPERIMENTAL__ -# define __CMJ__ -# endif +# define __PRINCIPLED__ +# define __CMJ__ # endif /* __KERNEL_OPENCL_INTEL_CPU__ */ #endif /* __KERNEL_OPENCL__ */ @@ -165,6 +176,8 @@ CCL_NAMESPACE_BEGIN #define __PATCH_EVAL__ #define __SHADOW_TRICKS__ +#define __DENOISING_FEATURES__ + #ifdef __KERNEL_SHADING__ # define __SVM__ # define __EMISSION__ @@ -220,7 +233,13 @@ CCL_NAMESPACE_BEGIN # undef __TRANSPARENT_SHADOWS__ #endif #ifdef __NO_SHADOW_TRICKS__ -#undef __SHADOW_TRICKS__ +# undef __SHADOW_TRICKS__ +#endif +#ifdef __NO_PRINCIPLED__ +# undef __PRINCIPLED__ +#endif +#ifdef __NO_DENOISING__ +# undef __DENOISING_FEATURES__ #endif /* Random Numbers */ @@ -303,31 +322,32 @@ enum SamplingPattern { /* these flags values correspond to raytypes in osl.cpp, so keep them in sync! 
*/ enum PathRayFlag { - PATH_RAY_CAMERA = 1, - PATH_RAY_REFLECT = 2, - PATH_RAY_TRANSMIT = 4, - PATH_RAY_DIFFUSE = 8, - PATH_RAY_GLOSSY = 16, - PATH_RAY_SINGULAR = 32, - PATH_RAY_TRANSPARENT = 64, - - PATH_RAY_SHADOW_OPAQUE = 128, - PATH_RAY_SHADOW_TRANSPARENT = 256, + PATH_RAY_CAMERA = (1 << 0), + PATH_RAY_REFLECT = (1 << 1), + PATH_RAY_TRANSMIT = (1 << 2), + PATH_RAY_DIFFUSE = (1 << 3), + PATH_RAY_GLOSSY = (1 << 4), + PATH_RAY_SINGULAR = (1 << 5), + PATH_RAY_TRANSPARENT = (1 << 6), + + PATH_RAY_SHADOW_OPAQUE = (1 << 7), + PATH_RAY_SHADOW_TRANSPARENT = (1 << 8), PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT), - PATH_RAY_CURVE = 512, /* visibility flag to define curve segments */ - PATH_RAY_VOLUME_SCATTER = 1024, /* volume scattering */ + PATH_RAY_CURVE = (1 << 9), /* visibility flag to define curve segments */ + PATH_RAY_VOLUME_SCATTER = (1 << 10), /* volume scattering */ /* Special flag to tag unaligned BVH nodes. */ - PATH_RAY_NODE_UNALIGNED = 2048, + PATH_RAY_NODE_UNALIGNED = (1 << 11), - PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024|2048), + PATH_RAY_ALL_VISIBILITY = ((1 << 12)-1), - PATH_RAY_MIS_SKIP = 4096, - PATH_RAY_DIFFUSE_ANCESTOR = 8192, - PATH_RAY_SINGLE_PASS_DONE = 16384, - PATH_RAY_SHADOW_CATCHER = 32768, - PATH_RAY_SHADOW_CATCHER_ONLY = 65536, + PATH_RAY_MIS_SKIP = (1 << 12), + PATH_RAY_DIFFUSE_ANCESTOR = (1 << 13), + PATH_RAY_SINGLE_PASS_DONE = (1 << 14), + PATH_RAY_SHADOW_CATCHER = (1 << 15), + PATH_RAY_SHADOW_CATCHER_ONLY = (1 << 16), + PATH_RAY_STORE_SHADOW_INFO = (1 << 17), }; /* Closure Label */ @@ -383,6 +403,22 @@ typedef enum PassType { #define PASS_ALL (~0) +typedef enum DenoisingPassOffsets { + DENOISING_PASS_NORMAL = 0, + DENOISING_PASS_NORMAL_VAR = 3, + DENOISING_PASS_ALBEDO = 6, + DENOISING_PASS_ALBEDO_VAR = 9, + DENOISING_PASS_DEPTH = 12, + DENOISING_PASS_DEPTH_VAR = 13, + DENOISING_PASS_SHADOW_A = 14, + DENOISING_PASS_SHADOW_B = 17, + DENOISING_PASS_COLOR = 20, + DENOISING_PASS_COLOR_VAR = 23, + 
+ DENOISING_PASS_SIZE_BASE = 26, + DENOISING_PASS_SIZE_CLEAN = 3, +} DenoisingPassOffsets; + typedef enum BakePassFilter { BAKE_FILTER_NONE = 0, BAKE_FILTER_DIRECT = (1 << 0), @@ -416,6 +452,18 @@ typedef enum BakePassFilterCombos { BAKE_FILTER_SUBSURFACE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_SUBSURFACE), } BakePassFilterCombos; +typedef enum DenoiseFlag { + DENOISING_CLEAN_DIFFUSE_DIR = (1 << 0), + DENOISING_CLEAN_DIFFUSE_IND = (1 << 1), + DENOISING_CLEAN_GLOSSY_DIR = (1 << 2), + DENOISING_CLEAN_GLOSSY_IND = (1 << 3), + DENOISING_CLEAN_TRANSMISSION_DIR = (1 << 4), + DENOISING_CLEAN_TRANSMISSION_IND = (1 << 5), + DENOISING_CLEAN_SUBSURFACE_DIR = (1 << 6), + DENOISING_CLEAN_SUBSURFACE_IND = (1 << 7), + DENOISING_CLEAN_ALL_PASSES = (1 << 8)-1, +} DenoiseFlag; + typedef ccl_addr_space struct PathRadiance { #ifdef __PASSES__ int use_light_pass; @@ -469,8 +517,20 @@ typedef ccl_addr_space struct PathRadiance { float3 path_total_shaded; /* Color of the background on which shadow is alpha-overed. */ - float3 shadow_color; + float3 shadow_background_color; + + /* Path radiance sum and throughput at the moment when ray hits shadow + * catcher object. 
+ */ + float3 shadow_radiance_sum; + float shadow_throughput; #endif + +#ifdef __DENOISING_FEATURES__ + float3 denoising_normal; + float3 denoising_albedo; + float denoising_depth; +#endif /* __DENOISING_FEATURES__ */ } PathRadiance; typedef struct BsdfEval { @@ -713,12 +773,13 @@ typedef struct AttributeDescriptor { #define SHADER_CLOSURE_BASE \ float3 weight; \ ClosureType type; \ - float sample_weight \ + float sample_weight; \ + float3 N typedef ccl_addr_space struct ccl_align(16) ShaderClosure { SHADER_CLOSURE_BASE; - float data[14]; /* pad to 80 bytes */ + float data[10]; /* pad to 80 bytes */ } ShaderClosure; /* Shader Context @@ -949,6 +1010,10 @@ typedef struct PathState { int transmission_bounce; int transparent_bounce; +#ifdef __DENOISING_FEATURES__ + float denoising_feature_weight; +#endif /* __DENOISING_FEATURES__ */ + /* multiple importance sampling */ float min_ray_pdf; /* smallest bounce pdf over entire path up to now */ float ray_pdf; /* last bounce pdf */ @@ -1126,6 +1191,11 @@ typedef struct KernelFilm { float mist_inv_depth; float mist_falloff; + int pass_denoising_data; + int pass_denoising_clean; + int denoising_flags; + int pad; + #ifdef __KERNEL_DEBUG__ int pass_bvh_traversed_nodes; int pass_bvh_traversed_instances; @@ -1298,7 +1368,6 @@ typedef ccl_addr_space struct DebugData { * Queue 3 - Shadow ray cast kernel - AO * Queeu 4 - Shadow ray cast kernel - direct lighting */ -#define NUM_QUEUES 4 /* Queue names */ enum QueueNumber { @@ -1311,22 +1380,42 @@ enum QueueNumber { * 3. Rays to be regenerated * are enqueued here. */ - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS = 1, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, /* All rays for which a shadow ray should be cast to determine radiance * contribution for AO are enqueued here. */ - QUEUE_SHADOW_RAY_CAST_AO_RAYS = 2, + QUEUE_SHADOW_RAY_CAST_AO_RAYS, /* All rays for which a shadow ray should be cast to determine radiance * contributing for direct lighting are enqueued here. 
*/ - QUEUE_SHADOW_RAY_CAST_DL_RAYS = 3, + QUEUE_SHADOW_RAY_CAST_DL_RAYS, + + /* Rays sorted according to shader->id */ + QUEUE_SHADER_SORTED_RAYS, + +#ifdef __BRANCHED_PATH__ + /* All rays moving to next iteration of the indirect loop for light */ + QUEUE_LIGHT_INDIRECT_ITER, + /* Queue of all inactive rays. These are candidates for sharing work of indirect loops */ + QUEUE_INACTIVE_RAYS, +# ifdef __VOLUME__ + /* All rays moving to next iteration of the indirect loop for volumes */ + QUEUE_VOLUME_INDIRECT_ITER, +# endif +# ifdef __SUBSURFACE__ + /* All rays moving to next iteration of the indirect loop for subsurface */ + QUEUE_SUBSURFACE_INDIRECT_ITER, +# endif +#endif /* __BRANCHED_PATH__ */ + + NUM_QUEUES }; -/* We use RAY_STATE_MASK to get ray_state (enums 0 to 5) */ -#define RAY_STATE_MASK 0x007 -#define RAY_FLAG_MASK 0x0F8 +/* We use RAY_STATE_MASK to get ray_state */ +#define RAY_STATE_MASK 0x0F +#define RAY_FLAG_MASK 0xF0 enum RayState { RAY_INVALID = 0, /* Denotes ray is actively involved in path-iteration. */ @@ -1341,14 +1430,25 @@ enum RayState { RAY_TO_REGENERATE, /* Denotes ray has been regenerated */ RAY_REGENERATED, - /* Flag's ray has to execute shadow blocked function in AO part */ - RAY_SHADOW_RAY_CAST_AO = 16, - /* Flag's ray has to execute shadow blocked function in direct lighting part. 
*/ - RAY_SHADOW_RAY_CAST_DL = 32, + /* Denotes ray is moving to next iteration of the branched indirect loop */ + RAY_LIGHT_INDIRECT_NEXT_ITER, + RAY_VOLUME_INDIRECT_NEXT_ITER, + RAY_SUBSURFACE_INDIRECT_NEXT_ITER, + + /* Ray flags */ + + /* Flags to denote that the ray is currently evaluating the branched indirect loop */ + RAY_BRANCHED_LIGHT_INDIRECT = (1 << 4), + RAY_BRANCHED_VOLUME_INDIRECT = (1 << 5), + RAY_BRANCHED_SUBSURFACE_INDIRECT = (1 << 6), + RAY_BRANCHED_INDIRECT = (RAY_BRANCHED_LIGHT_INDIRECT | RAY_BRANCHED_VOLUME_INDIRECT | RAY_BRANCHED_SUBSURFACE_INDIRECT), + + /* Ray is evaluating an iteration of an indirect loop for another thread */ + RAY_BRANCHED_INDIRECT_SHARED = (1 << 7), }; #define ASSIGN_RAY_STATE(ray_state, ray_index, state) (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state)) -#define IS_STATE(ray_state, ray_index, state) ((ray_state[ray_index] & RAY_STATE_MASK) == state) +#define IS_STATE(ray_state, ray_index, state) ((ray_index) != QUEUE_EMPTY_SLOT && ((ray_state)[(ray_index)] & RAY_STATE_MASK) == (state)) #define ADD_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] | flag)) #define REMOVE_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] & (~flag))) #define IS_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] & flag) diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h index 9c0878249d4..1e472aaf51a 100644 --- a/intern/cycles/kernel/kernel_volume.h +++ b/intern/cycles/kernel/kernel_volume.h @@ -660,6 +660,7 @@ typedef struct VolumeSegment { * but the entire segment is needed to do always scattering, rather than probabilistically * hitting or missing the volume. 
if we don't know the transmittance at the end of the * volume we can't generate stratified distance samples up to that transmittance */ +#ifdef __VOLUME_DECOUPLED__ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, VolumeSegment *segment, bool heterogeneous) { @@ -829,6 +830,7 @@ ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *s #endif } } +#endif /* __VOLUME_DECOUPLED__ */ /* scattering for homogeneous and heterogeneous volumes, using decoupled ray * marching. diff --git a/intern/cycles/kernel/kernels/cpu/filter.cpp b/intern/cycles/kernel/kernels/cpu/filter.cpp new file mode 100644 index 00000000000..2ff1a392dc3 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter.cpp @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CPU kernel entry points */ + +/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this + * one with SSE2 intrinsics. + */ +#if defined(__x86_64__) || defined(_M_X64) +# define __KERNEL_SSE2__ +#endif + +/* When building kernel for native machine detect kernel features from the flags + * set by compiler. 
+ */ +#ifdef WITH_KERNEL_NATIVE +# ifdef __SSE2__ +# ifndef __KERNEL_SSE2__ +# define __KERNEL_SSE2__ +# endif +# endif +# ifdef __SSE3__ +# define __KERNEL_SSE3__ +# endif +# ifdef __SSSE3__ +# define __KERNEL_SSSE3__ +# endif +# ifdef __SSE4_1__ +# define __KERNEL_SSE41__ +# endif +# ifdef __AVX__ +# define __KERNEL_SSE__ +# define __KERNEL_AVX__ +# endif +# ifdef __AVX2__ +# define __KERNEL_SSE__ +# define __KERNEL_AVX2__ +# endif +#endif + +/* quiet unused define warnings */ +#if defined(__KERNEL_SSE2__) + /* do nothing */ +#endif + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp new file mode 100644 index 00000000000..4a9e6047ecf --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp new file mode 100644 index 00000000000..c22ec576254 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp @@ -0,0 +1,40 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# define __KERNEL_AVX2__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h new file mode 100644 index 00000000000..2ed713299fd --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h @@ -0,0 +1,138 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Templated common declaration part of all CPU kernels. 
*/ + +void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, + TilesInfo *tiles, + int x, + int y, + float *unfilteredA, + float *unfilteredB, + float *sampleV, + float *sampleVV, + float *bufferV, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance); + +void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, + TilesInfo *tiles, + int m_offset, + int v_offset, + int x, + int y, + float *mean, + float *variance, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance); + +void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y, + ccl_global float *image, + ccl_global float *variance, + ccl_global float *depth, + ccl_global float *output, + int *rect, + int pass_stride); + +void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y, + float *mean, + float *variance, + float *a, + float *b, + int* prefilter_rect, + int r); + +void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, + int x, + int y, + int storage_ofs, + float *transform, + int *rank, + int* rect, + int pass_stride, + int radius, + float pca_threshold); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, + int dy, + float *weight_image, + float *variance, + float *difference_image, + int* rect, + int w, + int channel_offset, + float a, + float k_2); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image, + float *out_image, + int* rect, + int w, + int f); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image, + float *out_image, + int* rect, + int w, + int f); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, + int dy, + float *difference_image, + float *image, + float *out_image, + float *accum_image, + int* rect, + int w, + int f); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, + int dy, + float *difference_image, + float *buffer, + float 
*transform, + int *rank, + float *XtWX, + float3 *XtWY, + int *rect, + int *filter_rect, + int w, + int h, + int f, + int pass_stride); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image, + float *accum_image, + int* rect, + int w); + +void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x, + int y, + int storage_ofs, + int w, + int h, + float *buffer, + int *rank, + float *XtWX, + float3 *XtWY, + int *buffer_params, + int sample); + +#undef KERNEL_ARCH diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h new file mode 100644 index 00000000000..8dc1a8d583c --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h @@ -0,0 +1,272 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Templated common implementation part of all CPU kernels. + * + * The idea is that particular .cpp files sets needed optimization flags and + * simply includes this file without worry of copying actual implementation over. 
+ */ + +#include "kernel/kernel_compat_cpu.h" + +#include "kernel/filter/filter_kernel.h" + +#ifdef KERNEL_STUB +# include "util/util_debug.h" +# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!")) +#endif + +CCL_NAMESPACE_BEGIN + + +/* Denoise filter */ + +void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, + TilesInfo *tiles, + int x, + int y, + float *unfilteredA, + float *unfilteredB, + float *sampleVariance, + float *sampleVarianceV, + float *bufferVariance, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow); +#else + kernel_filter_divide_shadow(sample, tiles, + x, y, + unfilteredA, + unfilteredB, + sampleVariance, + sampleVarianceV, + bufferVariance, + load_int4(prefilter_rect), + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, + TilesInfo *tiles, + int m_offset, + int v_offset, + int x, + int y, + float *mean, float *variance, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_get_feature); +#else + kernel_filter_get_feature(sample, tiles, + m_offset, v_offset, + x, y, + mean, variance, + load_int4(prefilter_rect), + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y, + ccl_global float *image, + ccl_global float *variance, + ccl_global float *depth, + ccl_global float *output, + int *rect, + int pass_stride) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_detect_outliers); +#else + kernel_filter_detect_outliers(x, y, image, variance, depth, output, load_int4(rect), pass_stride); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, 
int y, + float *mean, + float *variance, + float *a, + float *b, + int* prefilter_rect, + int r) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_combine_halves); +#else + kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, + int x, + int y, + int storage_ofs, + float *transform, + int *rank, + int* prefilter_rect, + int pass_stride, + int radius, + float pca_threshold) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_construct_transform); +#else + rank += storage_ofs; + transform += storage_ofs*TRANSFORM_SIZE; + kernel_filter_construct_transform(buffer, + x, y, + load_int4(prefilter_rect), + pass_stride, + transform, + rank, + radius, + pca_threshold); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, + int dy, + float *weight_image, + float *variance, + float *difference_image, + int *rect, + int w, + int channel_offset, + float a, + float k_2) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference); +#else + kernel_filter_nlm_calc_difference(dx, dy, weight_image, variance, difference_image, load_int4(rect), w, channel_offset, a, k_2); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image, + float *out_image, + int *rect, + int w, + int f) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur); +#else + kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), w, f); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image, + float *out_image, + int *rect, + int w, + int f) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight); +#else + kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), w, f); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, + int dy, + float *difference_image, + float *image, + float 
*out_image, + float *accum_image, + int *rect, + int w, + int f) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output); +#else + kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), w, f); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, + int dy, + float *difference_image, + float *buffer, + float *transform, + int *rank, + float *XtWX, + float3 *XtWY, + int *rect, + int *filter_rect, + int w, + int h, + int f, + int pass_stride) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian); +#else + kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image, + float *accum_image, + int *rect, + int w) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize); +#else + kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), w); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x, + int y, + int storage_ofs, + int w, + int h, + float *buffer, + int *rank, + float *XtWX, + float3 *XtWY, + int *buffer_params, + int sample) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_finalize); +#else + XtWX += storage_ofs*XTWX_SIZE; + XtWY += storage_ofs*XTWY_SIZE; + rank += storage_ofs; + kernel_filter_finalize(x, y, w, h, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample); +#endif +} + +#undef KERNEL_STUB +#undef STUB_ASSERT +#undef KERNEL_ARCH + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp new file mode 100644 index 00000000000..f7c9935f1d0 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp @@ -0,0 +1,34 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, 
Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp new file mode 100644 index 00000000000..070b95a3505 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp @@ -0,0 +1,36 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp new file mode 100644 index 00000000000..1a7b2040da1 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp @@ -0,0 +1,37 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/kernels/cpu/kernel.cpp index 16992c681e6..998619ac897 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel.cpp @@ -95,9 +95,12 @@ void kernel_tex_copy(KernelGlobals *kg, else if(strstr(name, "__tex_image_float4")) { texture_image_float4 *tex = NULL; int id = atoi(name + strlen("__tex_image_float4_")); - int array_index = id; + int array_index = kernel_tex_index(id); - if(array_index >= 0 && array_index < TEX_NUM_FLOAT4_CPU) { + if(array_index >= 0) { + if(array_index >= kg->texture_float4_images.size()) { + kg->texture_float4_images.resize(array_index+1); + } tex = &kg->texture_float4_images[array_index]; } @@ -111,9 +114,12 @@ void kernel_tex_copy(KernelGlobals *kg, else if(strstr(name, "__tex_image_float")) { texture_image_float *tex = NULL; int id = atoi(name + strlen("__tex_image_float_")); - int array_index = id - TEX_START_FLOAT_CPU; + int array_index = kernel_tex_index(id); - if(array_index >= 0 && array_index < TEX_NUM_FLOAT_CPU) { + if(array_index >= 0) { + if(array_index >= kg->texture_float_images.size()) { + kg->texture_float_images.resize(array_index+1); + } tex = &kg->texture_float_images[array_index]; } @@ -127,9 +133,12 @@ void kernel_tex_copy(KernelGlobals *kg, else if(strstr(name, "__tex_image_byte4")) { texture_image_uchar4 *tex = NULL; int id = atoi(name + strlen("__tex_image_byte4_")); - int 
array_index = id - TEX_START_BYTE4_CPU; + int array_index = kernel_tex_index(id); - if(array_index >= 0 && array_index < TEX_NUM_BYTE4_CPU) { + if(array_index >= 0) { + if(array_index >= kg->texture_byte4_images.size()) { + kg->texture_byte4_images.resize(array_index+1); + } tex = &kg->texture_byte4_images[array_index]; } @@ -143,9 +152,12 @@ void kernel_tex_copy(KernelGlobals *kg, else if(strstr(name, "__tex_image_byte")) { texture_image_uchar *tex = NULL; int id = atoi(name + strlen("__tex_image_byte_")); - int array_index = id - TEX_START_BYTE_CPU; + int array_index = kernel_tex_index(id); - if(array_index >= 0 && array_index < TEX_NUM_BYTE_CPU) { + if(array_index >= 0) { + if(array_index >= kg->texture_byte_images.size()) { + kg->texture_byte_images.resize(array_index+1); + } tex = &kg->texture_byte_images[array_index]; } @@ -159,9 +171,12 @@ void kernel_tex_copy(KernelGlobals *kg, else if(strstr(name, "__tex_image_half4")) { texture_image_half4 *tex = NULL; int id = atoi(name + strlen("__tex_image_half4_")); - int array_index = id - TEX_START_HALF4_CPU; + int array_index = kernel_tex_index(id); - if(array_index >= 0 && array_index < TEX_NUM_HALF4_CPU) { + if(array_index >= 0) { + if(array_index >= kg->texture_half4_images.size()) { + kg->texture_half4_images.resize(array_index+1); + } tex = &kg->texture_half4_images[array_index]; } @@ -175,9 +190,12 @@ void kernel_tex_copy(KernelGlobals *kg, else if(strstr(name, "__tex_image_half")) { texture_image_half *tex = NULL; int id = atoi(name + strlen("__tex_image_half_")); - int array_index = id - TEX_START_HALF_CPU; + int array_index = kernel_tex_index(id); - if(array_index >= 0 && array_index < TEX_NUM_HALF_CPU) { + if(array_index >= 0) { + if(array_index >= kg->texture_half_images.size()) { + kg->texture_half_images.resize(array_index+1); + } tex = &kg->texture_half_images[array_index]; } diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp index 
2600d977972..a645fb4d8dd 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp @@ -17,21 +17,23 @@ /* Optimized CPU kernel entry points. This file is compiled with AVX * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ - -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -#endif #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_avx -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp index dba15d037ac..6bbb87727b9 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp @@ -18,21 +18,23 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -# define __KERNEL_AVX2__ -#endif - #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_avx2 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# define __KERNEL_AVX2__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h index 896b80d783e..c8938534fe8 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h @@ -77,16 +77,17 @@ DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission) DECLARE_SPLIT_KERNEL_FUNCTION(do_volume) DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue) DECLARE_SPLIT_KERNEL_FUNCTION(indirect_background) +DECLARE_SPLIT_KERNEL_FUNCTION(shader_setup) +DECLARE_SPLIT_KERNEL_FUNCTION(shader_sort) DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval) DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao) DECLARE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting) DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) +DECLARE_SPLIT_KERNEL_FUNCTION(enqueue_inactive) 
DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup) DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update) -void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func)); - #undef KERNEL_ARCH diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h index af68907a5c2..f6bb4c25012 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h @@ -23,51 +23,59 @@ CCL_NAMESPACE_BEGIN ccl_device float4 kernel_tex_image_interp_impl(KernelGlobals *kg, int tex, float x, float y) { - if(tex >= TEX_START_HALF_CPU) - return kg->texture_half_images[tex - TEX_START_HALF_CPU].interp(x, y); - else if(tex >= TEX_START_BYTE_CPU) - return kg->texture_byte_images[tex - TEX_START_BYTE_CPU].interp(x, y); - else if(tex >= TEX_START_FLOAT_CPU) - return kg->texture_float_images[tex - TEX_START_FLOAT_CPU].interp(x, y); - else if(tex >= TEX_START_HALF4_CPU) - return kg->texture_half4_images[tex - TEX_START_HALF4_CPU].interp(x, y); - else if(tex >= TEX_START_BYTE4_CPU) - return kg->texture_byte4_images[tex - TEX_START_BYTE4_CPU].interp(x, y); - else - return kg->texture_float4_images[tex].interp(x, y); + switch(kernel_tex_type(tex)) { + case IMAGE_DATA_TYPE_HALF: + return kg->texture_half_images[kernel_tex_index(tex)].interp(x, y); + case IMAGE_DATA_TYPE_BYTE: + return kg->texture_byte_images[kernel_tex_index(tex)].interp(x, y); + case IMAGE_DATA_TYPE_FLOAT: + return kg->texture_float_images[kernel_tex_index(tex)].interp(x, y); + case IMAGE_DATA_TYPE_HALF4: + return kg->texture_half4_images[kernel_tex_index(tex)].interp(x, y); + case IMAGE_DATA_TYPE_BYTE4: + return kg->texture_byte4_images[kernel_tex_index(tex)].interp(x, y); + case IMAGE_DATA_TYPE_FLOAT4: + default: + return kg->texture_float4_images[kernel_tex_index(tex)].interp(x, y); + } } ccl_device float4 
kernel_tex_image_interp_3d_impl(KernelGlobals *kg, int tex, float x, float y, float z) { - if(tex >= TEX_START_HALF_CPU) - return kg->texture_half_images[tex - TEX_START_HALF_CPU].interp_3d(x, y, z); - else if(tex >= TEX_START_BYTE_CPU) - return kg->texture_byte_images[tex - TEX_START_BYTE_CPU].interp_3d(x, y, z); - else if(tex >= TEX_START_FLOAT_CPU) - return kg->texture_float_images[tex - TEX_START_FLOAT_CPU].interp_3d(x, y, z); - else if(tex >= TEX_START_HALF4_CPU) - return kg->texture_half4_images[tex - TEX_START_HALF4_CPU].interp_3d(x, y, z); - else if(tex >= TEX_START_BYTE4_CPU) - return kg->texture_byte4_images[tex - TEX_START_BYTE4_CPU].interp_3d(x, y, z); - else - return kg->texture_float4_images[tex].interp_3d(x, y, z); - + switch(kernel_tex_type(tex)) { + case IMAGE_DATA_TYPE_HALF: + return kg->texture_half_images[kernel_tex_index(tex)].interp_3d(x, y, z); + case IMAGE_DATA_TYPE_BYTE: + return kg->texture_byte_images[kernel_tex_index(tex)].interp_3d(x, y, z); + case IMAGE_DATA_TYPE_FLOAT: + return kg->texture_float_images[kernel_tex_index(tex)].interp_3d(x, y, z); + case IMAGE_DATA_TYPE_HALF4: + return kg->texture_half4_images[kernel_tex_index(tex)].interp_3d(x, y, z); + case IMAGE_DATA_TYPE_BYTE4: + return kg->texture_byte4_images[kernel_tex_index(tex)].interp_3d(x, y, z); + case IMAGE_DATA_TYPE_FLOAT4: + default: + return kg->texture_float4_images[kernel_tex_index(tex)].interp_3d(x, y, z); + } } ccl_device float4 kernel_tex_image_interp_3d_ex_impl(KernelGlobals *kg, int tex, float x, float y, float z, int interpolation) { - if(tex >= TEX_START_HALF_CPU) - return kg->texture_half_images[tex - TEX_START_HALF_CPU].interp_3d_ex(x, y, z, interpolation); - else if(tex >= TEX_START_BYTE_CPU) - return kg->texture_byte_images[tex - TEX_START_BYTE_CPU].interp_3d_ex(x, y, z, interpolation); - else if(tex >= TEX_START_FLOAT_CPU) - return kg->texture_float_images[tex - TEX_START_FLOAT_CPU].interp_3d_ex(x, y, z, interpolation); - else if(tex >= TEX_START_HALF4_CPU) 
- return kg->texture_half4_images[tex - TEX_START_HALF4_CPU].interp_3d_ex(x, y, z, interpolation); - else if(tex >= TEX_START_BYTE4_CPU) - return kg->texture_byte4_images[tex - TEX_START_BYTE4_CPU].interp_3d_ex(x, y, z, interpolation); - else - return kg->texture_float4_images[tex].interp_3d_ex(x, y, z, interpolation); + switch(kernel_tex_type(tex)) { + case IMAGE_DATA_TYPE_HALF: + return kg->texture_half_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation); + case IMAGE_DATA_TYPE_BYTE: + return kg->texture_byte_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation); + case IMAGE_DATA_TYPE_FLOAT: + return kg->texture_float_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation); + case IMAGE_DATA_TYPE_HALF4: + return kg->texture_half4_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation); + case IMAGE_DATA_TYPE_BYTE4: + return kg->texture_byte4_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation); + case IMAGE_DATA_TYPE_FLOAT4: + default: + return kg->texture_float4_images[kernel_tex_index(tex)].interp_3d_ex(x, y, z, interpolation); + } } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h index 148b2eef568..d4315ee5ec4 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h @@ -22,38 +22,50 @@ #include "kernel/kernel_compat_cpu.h" -#ifndef __SPLIT_KERNEL__ -# include "kernel/kernel_math.h" -# include "kernel/kernel_types.h" - -# include "kernel/split/kernel_split_data.h" -# include "kernel/kernel_globals.h" - -# include "kernel/kernels/cpu/kernel_cpu_image.h" -# include "kernel/kernel_film.h" -# include "kernel/kernel_path.h" -# include "kernel/kernel_path_branched.h" -# include "kernel/kernel_bake.h" +#ifndef KERNEL_STUB +# ifndef __SPLIT_KERNEL__ +# include "kernel/kernel_math.h" +# include "kernel/kernel_types.h" + +# include 
"kernel/split/kernel_split_data.h" +# include "kernel/kernel_globals.h" + +# include "kernel/kernels/cpu/kernel_cpu_image.h" +# include "kernel/kernel_film.h" +# include "kernel/kernel_path.h" +# include "kernel/kernel_path_branched.h" +# include "kernel/kernel_bake.h" +# else +# include "kernel/split/kernel_split_common.h" + +# include "kernel/split/kernel_data_init.h" +# include "kernel/split/kernel_path_init.h" +# include "kernel/split/kernel_scene_intersect.h" +# include "kernel/split/kernel_lamp_emission.h" +# include "kernel/split/kernel_do_volume.h" +# include "kernel/split/kernel_queue_enqueue.h" +# include "kernel/split/kernel_indirect_background.h" +# include "kernel/split/kernel_shader_setup.h" +# include "kernel/split/kernel_shader_sort.h" +# include "kernel/split/kernel_shader_eval.h" +# include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" +# include "kernel/split/kernel_subsurface_scatter.h" +# include "kernel/split/kernel_direct_lighting.h" +# include "kernel/split/kernel_shadow_blocked_ao.h" +# include "kernel/split/kernel_shadow_blocked_dl.h" +# include "kernel/split/kernel_enqueue_inactive.h" +# include "kernel/split/kernel_next_iteration_setup.h" +# include "kernel/split/kernel_indirect_subsurface.h" +# include "kernel/split/kernel_buffer_update.h" +# endif /* __SPLIT_KERNEL__ */ #else -# include "kernel/split/kernel_split_common.h" - -# include "kernel/split/kernel_data_init.h" -# include "kernel/split/kernel_path_init.h" -# include "kernel/split/kernel_scene_intersect.h" -# include "kernel/split/kernel_lamp_emission.h" -# include "kernel/split/kernel_do_volume.h" -# include "kernel/split/kernel_queue_enqueue.h" -# include "kernel/split/kernel_indirect_background.h" -# include "kernel/split/kernel_shader_eval.h" -# include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" -# include "kernel/split/kernel_subsurface_scatter.h" -# include "kernel/split/kernel_direct_lighting.h" -# include 
"kernel/split/kernel_shadow_blocked_ao.h" -# include "kernel/split/kernel_shadow_blocked_dl.h" -# include "kernel/split/kernel_next_iteration_setup.h" -# include "kernel/split/kernel_indirect_subsurface.h" -# include "kernel/split/kernel_buffer_update.h" -#endif +# include "util/util_debug.h" +# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!")) + +# ifdef __SPLIT_KERNEL__ +# include "kernel/split/kernel_data_init.h" +# endif /* __SPLIT_KERNEL__ */ +#endif /* KERNEL_STUB */ CCL_NAMESPACE_BEGIN @@ -69,7 +81,10 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg, int offset, int stride) { -#ifdef __BRANCHED_PATH__ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, path_trace); +#else +# ifdef __BRANCHED_PATH__ if(kernel_data.integrator.branched) { kernel_branched_path_trace(kg, buffer, @@ -80,10 +95,11 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg, stride); } else -#endif +# endif { kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); } +#endif /* KERNEL_STUB */ } /* Film */ @@ -96,6 +112,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg, int offset, int stride) { +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, convert_to_byte); +#else kernel_film_convert_to_byte(kg, rgba, buffer, @@ -103,6 +122,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg, x, y, offset, stride); +#endif /* KERNEL_STUB */ } void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, @@ -113,6 +133,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, int offset, int stride) { +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, convert_to_half_float); +#else kernel_film_convert_to_half_float(kg, rgba, buffer, @@ -120,6 +143,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, x, y, offset, stride); +#endif /* KERNEL_STUB */ } /* Shader Evaluate */ @@ -134,9 +158,12 @@ void 
KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, int offset, int sample) { +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, shader); +#else if(type >= SHADER_EVAL_BAKE) { kernel_assert(output_luma == NULL); -#ifdef __BAKING__ +# ifdef __BAKING__ kernel_bake_evaluate(kg, input, output, @@ -145,7 +172,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, i, offset, sample); -#endif +# endif } else { kernel_shader_evaluate(kg, @@ -156,24 +183,39 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, i, sample); } +#endif /* KERNEL_STUB */ } #else /* __SPLIT_KERNEL__ */ /* Split Kernel Path Tracing */ -#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ +#ifdef KERNEL_STUB +# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ + { \ + STUB_ASSERT(KERNEL_ARCH, name); \ + } + +# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ + { \ + STUB_ASSERT(KERNEL_ARCH, name); \ + } +#else +# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ { \ kernel_##name(kg); \ } -#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ +# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ { \ ccl_local type locals; \ kernel_##name(kg, &locals); \ } +#endif /* KERNEL_STUB */ DEFINE_SPLIT_KERNEL_FUNCTION(path_init) DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect) @@ -181,49 +223,22 @@ DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) DEFINE_SPLIT_KERNEL_FUNCTION(do_volume) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals) 
+DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint) +DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint) DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) - -void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func)) -{ -#define REGISTER_NAME_STRING(name) #name -#define REGISTER_EVAL_NAME(name) REGISTER_NAME_STRING(name) -#define REGISTER(name) reg(REGISTER_EVAL_NAME(KERNEL_FUNCTION_FULL_NAME(name)), (void*)KERNEL_FUNCTION_FULL_NAME(name)); - - REGISTER(path_trace); - REGISTER(convert_to_byte); - REGISTER(convert_to_half_float); - REGISTER(shader); - - REGISTER(data_init); - REGISTER(path_init); - REGISTER(scene_intersect); - REGISTER(lamp_emission); - REGISTER(do_volume); - REGISTER(queue_enqueue); - REGISTER(indirect_background); - REGISTER(shader_eval); - REGISTER(holdout_emission_blurring_pathtermination_ao); - REGISTER(subsurface_scatter); - REGISTER(direct_lighting); - REGISTER(shadow_blocked_ao); - REGISTER(shadow_blocked_dl); - REGISTER(next_iteration_setup); - REGISTER(indirect_subsurface); - REGISTER(buffer_update); - -#undef REGISTER -#undef REGISTER_EVAL_NAME -#undef REGISTER_NAME_STRING -} - #endif /* __SPLIT_KERNEL__ */ +#undef KERNEL_STUB +#undef STUB_ASSERT +#undef KERNEL_ARCH + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp index 27a746a0799..6ba3425a343 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp +++ 
b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp @@ -17,22 +17,25 @@ /* Optimized CPU kernel entry points. This file is compiled with AVX * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ - -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -#endif #define __SPLIT_KERNEL__ #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_avx -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp index 364d279a189..76b2d77ebb8 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp @@ -18,23 +18,25 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -# define __KERNEL_AVX2__ -#endif - #define __SPLIT_KERNEL__ #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_avx2 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# define __KERNEL_AVX2__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp index 0afb481296f..b468b6f44c8 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp @@ -18,17 +18,19 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -#endif - #define __SPLIT_KERNEL__ #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse2 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp index 13d00813591..3e5792d0b17 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp @@ -18,19 +18,21 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -#endif - #define __SPLIT_KERNEL__ #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse3 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp index a4312071edc..3629f21cd29 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp @@ -18,20 +18,22 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -#endif - #define __SPLIT_KERNEL__ #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse41 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp index 1acfaa91ac9..57530c88710 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp @@ -18,15 +18,17 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -#endif - #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse2 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp index f7b6a2e21fe..c607753bc4b 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp @@ -18,17 +18,19 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -#endif - #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse3 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp index 1900c6e3012..a278554731c 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp @@ -18,18 +18,20 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -#endif - #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse41 -# include "kernel/kernels/cpu//kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu new file mode 100644 index 00000000000..009c3fde9d5 --- /dev/null +++ b/intern/cycles/kernel/kernels/cuda/filter.cu @@ -0,0 +1,255 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CUDA kernel entry points */ + +#ifdef __CUDA_ARCH__ + +#include "kernel_config.h" + +#include "kernel/kernel_compat_cuda.h" + +#include "kernel/filter/filter_kernel.h" + +/* kernels */ + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_divide_shadow(int sample, + TilesInfo *tiles, + float *unfilteredA, + float *unfilteredB, + float *sampleVariance, + float *sampleVarianceV, + float *bufferVariance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ + int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; + int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_divide_shadow(sample, + tiles, + x, y, + unfilteredA, + unfilteredB, + sampleVariance, + sampleVarianceV, + bufferVariance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_get_feature(int sample, + TilesInfo *tiles, + int m_offset, + int v_offset, + float *mean, + float *variance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ + int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; + int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_get_feature(sample, + tiles, + m_offset, v_offset, + x, y, + mean, variance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_detect_outliers(float *image, + float *variance, + float *depth, + float *output, + int4 prefilter_rect, + int pass_stride) +{ + int x = 
prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; + int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r) +{ + int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; + int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_construct_transform(float const* __restrict__ buffer, + float *transform, int *rank, + int4 filter_area, int4 rect, + int radius, float pca_threshold, + int pass_stride) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x; + int y = blockDim.y*blockIdx.y + threadIdx.y; + if(x < filter_area.z && y < filter_area.w) { + int *l_rank = rank + y*filter_area.z + x; + float *l_transform = transform + y*filter_area.z + x; + kernel_filter_construct_transform(buffer, + x + filter_area.x, y + filter_area.y, + rect, pass_stride, + l_transform, l_rank, + radius, pca_threshold, + filter_area.z*filter_area.w, + threadIdx.y*blockDim.x + threadIdx.x); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_calc_difference(int dx, int dy, + const float *ccl_restrict weight_image, + const float *ccl_restrict variance_image, + float *difference_image, + int4 rect, int w, + int channel_offset, + float a, float k_2) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x 
< rect.z && y < rect.w) { + kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_update_output(int dx, int dy, + const float *ccl_restrict difference_image, + const float *ccl_restrict image, + float *out_image, float *accum_image, + int4 rect, int w, + int f) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_normalize(float *out_image, const float *ccl_restrict accum_image, int4 rect, int w) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x < rect.z && y < 
rect.w) { + kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_construct_gramian(int dx, int dy, + const float *ccl_restrict difference_image, + const float *ccl_restrict buffer, + float const* __restrict__ transform, + int *rank, + float *XtWX, + float3 *XtWY, + int4 rect, + int4 filter_rect, + int w, int h, int f, + int pass_stride) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x); + int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y); + if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) { + kernel_filter_nlm_construct_gramian(x, y, + dx, dy, + difference_image, + buffer, + transform, rank, + XtWX, XtWY, + rect, filter_rect, + w, h, f, + pass_stride, + threadIdx.y*blockDim.x + threadIdx.x); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_finalize(int w, int h, + float *buffer, int *rank, + float *XtWX, float3 *XtWY, + int4 filter_area, int4 buffer_params, + int sample) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x; + int y = blockDim.y*blockIdx.y + threadIdx.y; + if(x < filter_area.z && y < filter_area.w) { + int storage_ofs = y*filter_area.z+x; + rank += storage_ofs; + XtWX += storage_ofs; + XtWY += storage_ofs; + kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample); + } +} + +#endif + diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu index a679eff8409..628891b1458 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu @@ -31,12 +31,15 @@ #include "kernel/split/kernel_do_volume.h" #include "kernel/split/kernel_queue_enqueue.h" #include 
"kernel/split/kernel_indirect_background.h" +#include "kernel/split/kernel_shader_setup.h" +#include "kernel/split/kernel_shader_sort.h" #include "kernel/split/kernel_shader_eval.h" #include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" #include "kernel/split/kernel_subsurface_scatter.h" #include "kernel/split/kernel_direct_lighting.h" #include "kernel/split/kernel_shadow_blocked_ao.h" #include "kernel/split/kernel_shadow_blocked_dl.h" +#include "kernel/split/kernel_enqueue_inactive.h" #include "kernel/split/kernel_next_iteration_setup.h" #include "kernel/split/kernel_indirect_subsurface.h" #include "kernel/split/kernel_buffer_update.h" @@ -108,12 +111,15 @@ DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) DEFINE_SPLIT_KERNEL_FUNCTION(do_volume) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals) +DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint) +DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint) DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl new file mode 100644 index 00000000000..ba53ba4b26f --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/filter.cl @@ -0,0 +1,280 @@ +/* + * Copyright 2011-2017 Blender 
Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* OpenCL kernel entry points */ + +#include "kernel/kernel_compat_opencl.h" + +#include "kernel/filter/filter_kernel.h" + +/* kernels */ + +__kernel void kernel_ocl_filter_divide_shadow(int sample, + ccl_global TilesInfo *tiles, + ccl_global float *unfilteredA, + ccl_global float *unfilteredB, + ccl_global float *sampleVariance, + ccl_global float *sampleVarianceV, + ccl_global float *bufferVariance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + char use_split_variance) +{ + int x = prefilter_rect.x + get_global_id(0); + int y = prefilter_rect.y + get_global_id(1); + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_divide_shadow(sample, + tiles, + x, y, + unfilteredA, + unfilteredB, + sampleVariance, + sampleVarianceV, + bufferVariance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); + } +} + +__kernel void kernel_ocl_filter_get_feature(int sample, + ccl_global TilesInfo *tiles, + int m_offset, + int v_offset, + ccl_global float *mean, + ccl_global float *variance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + char use_split_variance) +{ + int x = prefilter_rect.x + get_global_id(0); + int y = prefilter_rect.y + get_global_id(1); + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_get_feature(sample, + tiles, + m_offset, v_offset, + x, y, + 
mean, variance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); + } +} + +__kernel void kernel_ocl_filter_detect_outliers(ccl_global float *image, + ccl_global float *variance, + ccl_global float *depth, + ccl_global float *output, + int4 prefilter_rect, + int pass_stride) +{ + int x = prefilter_rect.x + get_global_id(0); + int y = prefilter_rect.y + get_global_id(1); + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride); + } +} + +__kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean, + ccl_global float *variance, + ccl_global float *a, + ccl_global float *b, + int4 prefilter_rect, + int r) +{ + int x = prefilter_rect.x + get_global_id(0); + int y = prefilter_rect.y + get_global_id(1); + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r); + } +} + +__kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_restrict buffer, + ccl_global float *transform, + ccl_global int *rank, + int4 filter_area, + int4 rect, + int pass_stride, + int radius, + float pca_threshold) +{ + int x = get_global_id(0); + int y = get_global_id(1); + if(x < filter_area.z && y < filter_area.w) { + ccl_global int *l_rank = rank + y*filter_area.z + x; + ccl_global float *l_transform = transform + y*filter_area.z + x; + kernel_filter_construct_transform(buffer, + x + filter_area.x, y + filter_area.y, + rect, pass_stride, + l_transform, l_rank, + radius, pca_threshold, + filter_area.z*filter_area.w, + get_local_id(1)*get_local_size(0) + get_local_id(0)); + } +} + +__kernel void kernel_ocl_filter_nlm_calc_difference(int dx, + int dy, + const ccl_global float *ccl_restrict weight_image, + const ccl_global float *ccl_restrict variance_image, + ccl_global float *difference_image, + int4 rect, + int w, + int channel_offset, + float a, + float k_2) +{ 
+ int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2); + } +} + +__kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict difference_image, + ccl_global float *out_image, + int4 rect, + int w, + int f) +{ + int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f); + } +} + +__kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_restrict difference_image, + ccl_global float *out_image, + int4 rect, + int w, + int f) +{ + int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f); + } +} + +__kernel void kernel_ocl_filter_nlm_update_output(int dx, + int dy, + const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict image, + ccl_global float *out_image, + ccl_global float *accum_image, + int4 rect, + int w, + int f) +{ + int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f); + } +} + +__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image, + const ccl_global float *ccl_restrict accum_image, + int4 rect, + int w) +{ + int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w); + } +} + +__kernel void kernel_ocl_filter_nlm_construct_gramian(int dx, + int dy, + const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict buffer, + const ccl_global 
float *ccl_restrict transform, + ccl_global int *rank, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 rect, + int4 filter_rect, + int w, + int h, + int f, + int pass_stride) +{ + int x = get_global_id(0) + max(0, rect.x-filter_rect.x); + int y = get_global_id(1) + max(0, rect.y-filter_rect.y); + if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) { + kernel_filter_nlm_construct_gramian(x, y, + dx, dy, + difference_image, + buffer, + transform, rank, + XtWX, XtWY, + rect, filter_rect, + w, h, f, + pass_stride, + get_local_id(1)*get_local_size(0) + get_local_id(0)); + } +} + +__kernel void kernel_ocl_filter_finalize(int w, + int h, + ccl_global float *buffer, + ccl_global int *rank, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 filter_area, + int4 buffer_params, + int sample) +{ + int x = get_global_id(0); + int y = get_global_id(1); + if(x < filter_area.z && y < filter_area.w) { + int storage_ofs = y*filter_area.z+x; + rank += storage_ofs; + XtWX += storage_ofs; + XtWY += storage_ofs; + kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample); + } +} + +__kernel void kernel_ocl_filter_set_tiles(ccl_global TilesInfo* tiles, + ccl_global float *buffer_1, + ccl_global float *buffer_2, + ccl_global float *buffer_3, + ccl_global float *buffer_4, + ccl_global float *buffer_5, + ccl_global float *buffer_6, + ccl_global float *buffer_7, + ccl_global float *buffer_8, + ccl_global float *buffer_9) +{ + if((get_global_id(0) == 0) && (get_global_id(1) == 0)) { + tiles->buffers[0] = buffer_1; + tiles->buffers[1] = buffer_2; + tiles->buffers[2] = buffer_3; + tiles->buffers[3] = buffer_4; + tiles->buffers[4] = buffer_5; + tiles->buffers[5] = buffer_6; + tiles->buffers[6] = buffer_7; + tiles->buffers[7] = buffer_8; + tiles->buffers[8] = buffer_9; + } +} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl 
b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl index db65c91baf7..dcea2630aef 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl @@ -18,10 +18,9 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_buffer_update.h" -__kernel void kernel_ocl_path_trace_buffer_update( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local unsigned int local_queue_atomics; - kernel_buffer_update((KernelGlobals*)kg, &local_queue_atomics); -} +#define KERNEL_NAME buffer_update +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl index eb34f750881..ed64ae01aae 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl @@ -18,10 +18,9 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_direct_lighting.h" -__kernel void kernel_ocl_path_trace_direct_lighting( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local unsigned int local_queue_atomics; - kernel_direct_lighting((KernelGlobals*)kg, &local_queue_atomics); -} +#define KERNEL_NAME direct_lighting +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl index 83ef5f5f3f2..8afaa686e28 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_do_volume.h" -__kernel void 
kernel_ocl_path_trace_do_volume( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_do_volume((KernelGlobals*)kg); -} +#define KERNEL_NAME do_volume +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl b/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl new file mode 100644 index 00000000000..e68d4104a91 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl @@ -0,0 +1,26 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_enqueue_inactive.h" + +#define KERNEL_NAME enqueue_inactive +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl index d071b39aa6f..9e1e57beba6 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl @@ -18,12 +18,9 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" -__kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local BackgroundAOLocals locals; - kernel_holdout_emission_blurring_pathtermination_ao( - (KernelGlobals*)kg, - &locals); -} +#define KERNEL_NAME holdout_emission_blurring_pathtermination_ao +#define LOCALS_TYPE BackgroundAOLocals +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl index 8c213ff5cb2..192d01444ba 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_indirect_background.h" -__kernel void kernel_ocl_path_trace_indirect_background( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_indirect_background((KernelGlobals*)kg); -} +#define 
KERNEL_NAME indirect_background +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl index 998ebc4c0c3..84938b889e5 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_indirect_subsurface.h" -__kernel void kernel_ocl_path_trace_indirect_subsurface( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_indirect_subsurface((KernelGlobals*)kg); -} +#define KERNEL_NAME indirect_subsurface +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl index 822d2287715..c314dc96c33 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_lamp_emission.h" -__kernel void kernel_ocl_path_trace_lamp_emission( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_lamp_emission((KernelGlobals*)kg); -} +#define KERNEL_NAME lamp_emission +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl index 6d207253a40..8b1332bf013 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl @@ -18,10 +18,9 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_next_iteration_setup.h" -__kernel 
void kernel_ocl_path_trace_next_iteration_setup( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local unsigned int local_queue_atomics; - kernel_next_iteration_setup((KernelGlobals*)kg, &local_queue_atomics); -} +#define KERNEL_NAME next_iteration_setup +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl index bd9aa9538c8..fa210e747c0 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_path_init.h" -__kernel void kernel_ocl_path_trace_path_init( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_path_init((KernelGlobals*)kg); -} +#define KERNEL_NAME path_init +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl index 9be154e3d75..68ee6f1d536 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl @@ -18,10 +18,9 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_queue_enqueue.h" -__kernel void kernel_ocl_path_trace_queue_enqueue( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local QueueEnqueueLocals locals; - kernel_queue_enqueue((KernelGlobals*)kg, &locals); -} +#define KERNEL_NAME queue_enqueue +#define LOCALS_TYPE QueueEnqueueLocals +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl 
index eb4fb4d153a..10d09377ba9 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_scene_intersect.h" -__kernel void kernel_ocl_path_trace_scene_intersect( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_scene_intersect((KernelGlobals*)kg); -} +#define KERNEL_NAME scene_intersect +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl index 6baee460986..40eaa561863 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl @@ -18,10 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_shader_eval.h" -__kernel void kernel_ocl_path_trace_shader_eval( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local unsigned int local_queue_atomics; - kernel_shader_eval((KernelGlobals*)kg, &local_queue_atomics); -} +#define KERNEL_NAME shader_eval +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl new file mode 100644 index 00000000000..8c36100f762 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl @@ -0,0 +1,26 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_shader_setup.h" + +#define KERNEL_NAME shader_setup +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl new file mode 100644 index 00000000000..bcacaa4a054 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl @@ -0,0 +1,27 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_shader_sort.h" + +__attribute__((reqd_work_group_size(64, 1, 1))) +#define KERNEL_NAME shader_sort +#define LOCALS_TYPE ShaderSortLocals +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl index 6a8ef81b32a..8de250a375c 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_shadow_blocked_ao.h" -__kernel void kernel_ocl_path_trace_shadow_blocked_ao( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_shadow_blocked_ao((KernelGlobals*)kg); -} +#define KERNEL_NAME shadow_blocked_ao +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl index b255cc5ef8b..29da77022ed 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl @@ -18,9 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_shadow_blocked_dl.h" -__kernel void kernel_ocl_path_trace_shadow_blocked_dl( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - kernel_shadow_blocked_dl((KernelGlobals*)kg); -} +#define KERNEL_NAME shadow_blocked_dl +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split.cl b/intern/cycles/kernel/kernels/opencl/kernel_split.cl index 732cda30115..651addb02f4 100644 --- 
a/intern/cycles/kernel/kernels/opencl/kernel_split.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_split.cl @@ -23,12 +23,15 @@ #include "kernel/kernels/opencl/kernel_do_volume.cl" #include "kernel/kernels/opencl/kernel_indirect_background.cl" #include "kernel/kernels/opencl/kernel_queue_enqueue.cl" +#include "kernel/kernels/opencl/kernel_shader_setup.cl" +#include "kernel/kernels/opencl/kernel_shader_sort.cl" #include "kernel/kernels/opencl/kernel_shader_eval.cl" #include "kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" #include "kernel/kernels/opencl/kernel_subsurface_scatter.cl" #include "kernel/kernels/opencl/kernel_direct_lighting.cl" #include "kernel/kernels/opencl/kernel_shadow_blocked_ao.cl" #include "kernel/kernels/opencl/kernel_shadow_blocked_dl.cl" +#include "kernel/kernels/opencl/kernel_enqueue_inactive.cl" #include "kernel/kernels/opencl/kernel_next_iteration_setup.cl" #include "kernel/kernels/opencl/kernel_indirect_subsurface.cl" #include "kernel/kernels/opencl/kernel_buffer_update.cl" diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h new file mode 100644 index 00000000000..f1e914a70d4 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h @@ -0,0 +1,72 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#define KERNEL_NAME_JOIN(a, b) a ## _ ## b +#define KERNEL_NAME_EVAL(a, b) KERNEL_NAME_JOIN(a, b) + +__kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)( + ccl_global char *kg_global, + ccl_constant KernelData *data, + + ccl_global void *split_data_buffer, + ccl_global char *ray_state, + ccl_global uint *rng_state, + +#define KERNEL_TEX(type, ttype, name) \ + ccl_global type *name, +#include "kernel/kernel_textures.h" + + ccl_global int *queue_index, + ccl_global char *use_queues_flag, + ccl_global unsigned int *work_pools, + ccl_global float *buffer + ) +{ +#ifdef LOCALS_TYPE + ccl_local LOCALS_TYPE locals; +#endif + + KernelGlobals *kg = (KernelGlobals*)kg_global; + + if(ccl_local_id(0) + ccl_local_id(1) == 0) { + kg->data = data; + + kernel_split_params.rng_state = rng_state; + kernel_split_params.queue_index = queue_index; + kernel_split_params.use_queues_flag = use_queues_flag; + kernel_split_params.work_pools = work_pools; + kernel_split_params.buffer = buffer; + + split_data_init(kg, &kernel_split_state, ccl_global_size(0)*ccl_global_size(1), split_data_buffer, ray_state); + +#define KERNEL_TEX(type, ttype, name) \ + kg->name = name; +#include "kernel/kernel_textures.h" + } + + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + KERNEL_NAME_EVAL(kernel, KERNEL_NAME)( + kg +#ifdef LOCALS_TYPE + , &locals +#endif + ); +} + +#undef KERNEL_NAME_JOIN +#undef KERNEL_NAME_EVAL + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl index 7a1838e485f..2b3be38df84 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl @@ -18,10 +18,7 @@ #include "kernel/split/kernel_split_common.h" #include "kernel/split/kernel_subsurface_scatter.h" -__kernel void kernel_ocl_path_trace_subsurface_scatter( - ccl_global char *kg, - ccl_constant KernelData *data) -{ - ccl_local unsigned int 
local_queue_atomics; - kernel_subsurface_scatter((KernelGlobals*)kg, &local_queue_atomics); -} +#define KERNEL_NAME subsurface_scatter +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp index 95beea01d25..27a96720c1e 100644 --- a/intern/cycles/kernel/osl/osl_bssrdf.cpp +++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp @@ -39,7 +39,9 @@ #include "kernel/kernel_montecarlo.h" #include "kernel/closure/alloc.h" +#include "kernel/closure/bsdf_util.h" #include "kernel/closure/bsdf_diffuse.h" +#include "kernel/closure/bsdf_principled_diffuse.h" #include "kernel/closure/bssrdf.h" CCL_NAMESPACE_BEGIN @@ -78,6 +80,7 @@ public: bssrdf->albedo = albedo.x; bssrdf->sharpness = sharpness; bssrdf->N = params.N; + bssrdf->roughness = params.roughness; sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } @@ -89,6 +92,7 @@ public: bssrdf->albedo = albedo.y; bssrdf->sharpness = sharpness; bssrdf->N = params.N; + bssrdf->roughness = params.roughness; sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } @@ -100,6 +104,7 @@ public: bssrdf->albedo = albedo.z; bssrdf->sharpness = sharpness; bssrdf->N = params.N; + bssrdf->roughness = params.roughness; sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } } @@ -180,5 +185,31 @@ ClosureParam *closure_bssrdf_burley_params() CCLOSURE_PREPARE(closure_bssrdf_burley_prepare, BurleyBSSRDFClosure) +/* Disney principled */ + +class PrincipledBSSRDFClosure : public CBSSRDFClosure { +public: + void setup(ShaderData *sd, int path_flag, float3 weight) + { + alloc(sd, path_flag, weight, CLOSURE_BSSRDF_PRINCIPLED_ID); + } +}; + +ClosureParam *closure_bssrdf_principled_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(PrincipledBSSRDFClosure, params.N), + CLOSURE_FLOAT3_PARAM(PrincipledBSSRDFClosure, radius), + CLOSURE_FLOAT_PARAM(PrincipledBSSRDFClosure, params.texture_blur), + 
CLOSURE_FLOAT3_PARAM(PrincipledBSSRDFClosure, albedo), + CLOSURE_FLOAT_PARAM(PrincipledBSSRDFClosure, params.roughness), + CLOSURE_STRING_KEYPARAM(PrincipledBSSRDFClosure, label, "label"), + CLOSURE_FINISH_PARAM(PrincipledBSSRDFClosure) + }; + return params; +} + +CCLOSURE_PREPARE(closure_bssrdf_principled_prepare, PrincipledBSSRDFClosure) + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp index f44714c2150..14c5c1c3db5 100644 --- a/intern/cycles/kernel/osl/osl_closures.cpp +++ b/intern/cycles/kernel/osl/osl_closures.cpp @@ -60,6 +60,8 @@ #include "kernel/closure/bsdf_ashikhmin_shirley.h" #include "kernel/closure/bsdf_toon.h" #include "kernel/closure/bsdf_hair.h" +#include "kernel/closure/bsdf_principled_diffuse.h" +#include "kernel/closure/bsdf_principled_sheen.h" #include "kernel/closure/volume.h" CCL_NAMESPACE_BEGIN @@ -154,7 +156,7 @@ BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannRefraction, microfacet_beckmann_refra BSDF_CLOSURE_CLASS_END(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction) BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, HairBsdf, LABEL_GLOSSY) - CLOSURE_FLOAT3_PARAM(HairReflectionClosure, unused), + CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.N), CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.roughness1), CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.roughness2), CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.T), @@ -162,7 +164,7 @@ BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, HairBsdf, LABEL_GLOSSY BSDF_CLOSURE_CLASS_END(HairReflection, hair_reflection) BSDF_CLOSURE_CLASS_BEGIN(HairTransmission, hair_transmission, HairBsdf, LABEL_GLOSSY) - CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, unused), + CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, params.N), CLOSURE_FLOAT_PARAM(HairTransmissionClosure, params.roughness1), CLOSURE_FLOAT_PARAM(HairTransmissionClosure, params.roughness2), CLOSURE_FLOAT3_PARAM(HairReflectionClosure, 
params.T), @@ -176,6 +178,63 @@ VOLUME_CLOSURE_CLASS_END(VolumeHenyeyGreenstein, henyey_greenstein) VOLUME_CLOSURE_CLASS_BEGIN(VolumeAbsorption, absorption, ShaderClosure, LABEL_SINGULAR) VOLUME_CLOSURE_CLASS_END(VolumeAbsorption, absorption) +BSDF_CLOSURE_CLASS_BEGIN(PrincipledDiffuse, principled_diffuse, PrincipledDiffuseBsdf, LABEL_DIFFUSE) + CLOSURE_FLOAT3_PARAM(PrincipledDiffuseClosure, params.N), + CLOSURE_FLOAT_PARAM(PrincipledDiffuseClosure, params.roughness), +BSDF_CLOSURE_CLASS_END(PrincipledDiffuse, principled_diffuse) + +BSDF_CLOSURE_CLASS_BEGIN(PrincipledSheen, principled_sheen, PrincipledSheenBsdf, LABEL_DIFFUSE) + CLOSURE_FLOAT3_PARAM(PrincipledSheenClosure, params.N), +BSDF_CLOSURE_CLASS_END(PrincipledSheen, principled_sheen) + +/* DISNEY PRINCIPLED CLEARCOAT */ +class PrincipledClearcoatClosure : public CBSDFClosure { +public: + MicrofacetBsdf params; + float clearcoat, clearcoat_roughness; + + MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, ¶ms); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + if(bsdf && extra) { + bsdf->extra = extra; + + bsdf->ior = 1.5f; + + bsdf->alpha_x = clearcoat_roughness; + bsdf->alpha_y = clearcoat_roughness; + + bsdf->extra->cspec0 = make_float3(0.04f, 0.04f, 0.04f); + bsdf->extra->clearcoat = clearcoat; + + return bsdf; + } + + return NULL; + } + + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + sd->flag |= (bsdf) ? 
bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd) : 0; + } +}; + +ClosureParam *closure_bsdf_principled_clearcoat_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(PrincipledClearcoatClosure, params.N), + CLOSURE_FLOAT_PARAM(PrincipledClearcoatClosure, clearcoat), + CLOSURE_FLOAT_PARAM(PrincipledClearcoatClosure, clearcoat_roughness), + CLOSURE_STRING_KEYPARAM(PrincipledClearcoatClosure, label, "label"), + CLOSURE_FINISH_PARAM(PrincipledClearcoatClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_principled_clearcoat_prepare, PrincipledClearcoatClosure) + + /* Registration */ static void register_closure(OSL::ShadingSystem *ss, const char *name, int id, OSL::ClosureParam *params, OSL::PrepareClosureFunc prepare) @@ -215,6 +274,16 @@ void OSLShader::register_closures(OSLShadingSystem *ss_) closure_bsdf_microfacet_multi_ggx_glass_params(), closure_bsdf_microfacet_multi_ggx_glass_prepare); register_closure(ss, "microfacet_multi_ggx_aniso", id++, closure_bsdf_microfacet_multi_ggx_aniso_params(), closure_bsdf_microfacet_multi_ggx_aniso_prepare); + register_closure(ss, "microfacet_ggx_fresnel", id++, + closure_bsdf_microfacet_ggx_fresnel_params(), closure_bsdf_microfacet_ggx_fresnel_prepare); + register_closure(ss, "microfacet_ggx_aniso_fresnel", id++, + closure_bsdf_microfacet_ggx_aniso_fresnel_params(), closure_bsdf_microfacet_ggx_aniso_fresnel_prepare); + register_closure(ss, "microfacet_multi_ggx_fresnel", id++, + closure_bsdf_microfacet_multi_ggx_fresnel_params(), closure_bsdf_microfacet_multi_ggx_fresnel_prepare); + register_closure(ss, "microfacet_multi_ggx_glass_fresnel", id++, + closure_bsdf_microfacet_multi_ggx_glass_fresnel_params(), closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare); + register_closure(ss, "microfacet_multi_ggx_aniso_fresnel", id++, + closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params(), closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare); register_closure(ss, "microfacet_beckmann", id++, 
bsdf_microfacet_beckmann_params(), bsdf_microfacet_beckmann_prepare); register_closure(ss, "microfacet_beckmann_aniso", id++, @@ -229,6 +298,12 @@ void OSLShader::register_closures(OSLShadingSystem *ss_) bsdf_diffuse_toon_params(), bsdf_diffuse_toon_prepare); register_closure(ss, "glossy_toon", id++, bsdf_glossy_toon_params(), bsdf_glossy_toon_prepare); + register_closure(ss, "principled_diffuse", id++, + bsdf_principled_diffuse_params(), bsdf_principled_diffuse_prepare); + register_closure(ss, "principled_sheen", id++, + bsdf_principled_sheen_params(), bsdf_principled_sheen_prepare); + register_closure(ss, "principled_clearcoat", id++, + closure_bsdf_principled_clearcoat_params(), closure_bsdf_principled_clearcoat_prepare); register_closure(ss, "emission", id++, closure_emission_params(), closure_emission_prepare); @@ -248,6 +323,8 @@ void OSLShader::register_closures(OSLShadingSystem *ss_) closure_bssrdf_gaussian_params(), closure_bssrdf_gaussian_prepare); register_closure(ss, "bssrdf_burley", id++, closure_bssrdf_burley_params(), closure_bssrdf_burley_prepare); + register_closure(ss, "bssrdf_principled", id++, + closure_bssrdf_principled_params(), closure_bssrdf_principled_prepare); register_closure(ss, "hair_reflection", id++, bsdf_hair_reflection_params(), bsdf_hair_reflection_prepare); @@ -278,6 +355,86 @@ bool CBSDFClosure::skip(const ShaderData *sd, int path_flag, int scattering) return false; } + +/* GGX closures with Fresnel */ + +class MicrofacetFresnelClosure : public CBSDFClosure { +public: + MicrofacetBsdf params; + float3 color; + float3 cspec0; + + MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight) + { + /* Technically, the MultiGGX Glass closure may also transmit. However, + * since this is set statically and only used for caustic flags, this + * is probably as good as it gets. 
*/ + if(!skip(sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, ¶ms); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + if(bsdf && extra) { + bsdf->extra = extra; + bsdf->extra->color = color; + bsdf->extra->cspec0 = cspec0; + return bsdf; + } + } + + return NULL; + } +}; + +class MicrofacetGGXFresnelClosure : public MicrofacetFresnelClosure { +public: + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + sd->flag |= (bsdf) ? bsdf_microfacet_ggx_fresnel_setup(bsdf, sd) : 0; + } +}; + +ClosureParam *closure_bsdf_microfacet_ggx_fresnel_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, params.N), + CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.alpha_x), + CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.ior), + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, color), + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, cspec0), + CLOSURE_STRING_KEYPARAM(MicrofacetGGXFresnelClosure, label, "label"), + CLOSURE_FINISH_PARAM(MicrofacetGGXFresnelClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_microfacet_ggx_fresnel_prepare, MicrofacetGGXFresnelClosure); + +class MicrofacetGGXAnisoFresnelClosure : public MicrofacetFresnelClosure { +public: + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + sd->flag |= (bsdf) ? 
bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf, sd) : 0; + } +}; + +ClosureParam *closure_bsdf_microfacet_ggx_aniso_fresnel_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, params.N), + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, params.T), + CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.alpha_x), + CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.alpha_y), + CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.ior), + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, color), + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, cspec0), + CLOSURE_STRING_KEYPARAM(MicrofacetGGXFresnelClosure, label, "label"), + CLOSURE_FINISH_PARAM(MicrofacetGGXFresnelClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_microfacet_ggx_aniso_fresnel_prepare, MicrofacetGGXAnisoFresnelClosure); + + /* Multiscattering GGX closures */ class MicrofacetMultiClosure : public CBSDFClosure { @@ -287,7 +444,7 @@ public: MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight) { - /* Technically, the MultiGGX Glass closure may also transmit. However, + /* Technically, the MultiGGX closure may also transmit. However, * since this is set statically and only used for caustic flags, this * is probably as good as it gets. */ if(!skip(sd, path_flag, LABEL_GLOSSY|LABEL_REFLECT)) { @@ -375,5 +532,110 @@ ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_params() } CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_glass_prepare, MicrofacetMultiGGXGlassClosure); + +/* Multiscattering GGX closures with Fresnel */ + +class MicrofacetMultiFresnelClosure : public CBSDFClosure { +public: + MicrofacetBsdf params; + float3 color; + float3 cspec0; + + MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight) + { + /* Technically, the MultiGGX closure may also transmit. However, + * since this is set statically and only used for caustic flags, this + * is probably as good as it gets. 
*/ + if(!skip(sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, ¶ms); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + if(bsdf && extra) { + bsdf->extra = extra; + bsdf->extra->color = color; + bsdf->extra->cspec0 = cspec0; + return bsdf; + } + } + + return NULL; + } +}; + +class MicrofacetMultiGGXFresnelClosure : public MicrofacetMultiFresnelClosure { +public: + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_fresnel_setup(bsdf, sd) : 0; + } +}; + +ClosureParam *closure_bsdf_microfacet_multi_ggx_fresnel_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.N), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_x), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.ior), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, color), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, cspec0), + CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXFresnelClosure, label, "label"), + CLOSURE_FINISH_PARAM(MicrofacetMultiGGXFresnelClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_fresnel_prepare, MicrofacetMultiGGXFresnelClosure); + +class MicrofacetMultiGGXAnisoFresnelClosure : public MicrofacetMultiFresnelClosure { +public: + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + sd->flag |= (bsdf) ? 
bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf, sd) : 0; + } +}; + +ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.N), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.T), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_x), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_y), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.ior), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, color), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, cspec0), + CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXFresnelClosure, label, "label"), + CLOSURE_FINISH_PARAM(MicrofacetMultiGGXFresnelClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare, MicrofacetMultiGGXAnisoFresnelClosure); + +class MicrofacetMultiGGXGlassFresnelClosure : public MicrofacetMultiFresnelClosure { +public: + MicrofacetMultiGGXGlassFresnelClosure() : MicrofacetMultiFresnelClosure() {} + + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + sd->flag |= (bsdf) ? 
bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf, sd) : 0; + } +}; + +ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_fresnel_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.N), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_x), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.ior), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, color), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, cspec0), + CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXFresnelClosure, label, "label"), + CLOSURE_FINISH_PARAM(MicrofacetMultiGGXFresnelClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare, MicrofacetMultiGGXGlassFresnelClosure); + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h index 929cf00a7e6..ff5fd9cc905 100644 --- a/intern/cycles/kernel/osl/osl_closures.h +++ b/intern/cycles/kernel/osl/osl_closures.h @@ -51,10 +51,17 @@ OSL::ClosureParam *closure_bsdf_phong_ramp_params(); OSL::ClosureParam *closure_bssrdf_cubic_params(); OSL::ClosureParam *closure_bssrdf_gaussian_params(); OSL::ClosureParam *closure_bssrdf_burley_params(); +OSL::ClosureParam *closure_bssrdf_principled_params(); OSL::ClosureParam *closure_henyey_greenstein_volume_params(); OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_params(); OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_params(); OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_params(); +OSL::ClosureParam *closure_bsdf_microfacet_ggx_fresnel_params(); +OSL::ClosureParam *closure_bsdf_microfacet_ggx_aniso_fresnel_params(); +OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_fresnel_params(); +OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_fresnel_params(); +OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params(); +OSL::ClosureParam 
*closure_bsdf_principled_clearcoat_params(); void closure_emission_prepare(OSL::RendererServices *, int id, void *data); void closure_background_prepare(OSL::RendererServices *, int id, void *data); @@ -65,10 +72,17 @@ void closure_bsdf_phong_ramp_prepare(OSL::RendererServices *, int id, void *data void closure_bssrdf_cubic_prepare(OSL::RendererServices *, int id, void *data); void closure_bssrdf_gaussian_prepare(OSL::RendererServices *, int id, void *data); void closure_bssrdf_burley_prepare(OSL::RendererServices *, int id, void *data); +void closure_bssrdf_principled_prepare(OSL::RendererServices *, int id, void *data); void closure_henyey_greenstein_volume_prepare(OSL::RendererServices *, int id, void *data); void closure_bsdf_microfacet_multi_ggx_prepare(OSL::RendererServices *, int id, void *data); void closure_bsdf_microfacet_multi_ggx_glass_prepare(OSL::RendererServices *, int id, void *data); void closure_bsdf_microfacet_multi_ggx_aniso_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_microfacet_ggx_fresnel_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_microfacet_ggx_aniso_fresnel_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_microfacet_multi_ggx_fresnel_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_principled_clearcoat_prepare(OSL::RendererServices *, int id, void *data); #define CCLOSURE_PREPARE(name, classname) \ void name(RendererServices *, int id, void *data) \ diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp index b767c60c617..1535496c73d 100644 --- a/intern/cycles/kernel/osl/osl_services.cpp +++ b/intern/cycles/kernel/osl/osl_services.cpp @@ -824,7 +824,7 @@ bool 
OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData * bool OSLRenderServices::get_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object_name, TypeDesc type, ustring name, void *val) { - if(sg->renderstate == NULL) + if(sg == NULL || sg->renderstate == NULL) return false; ShaderData *sd = (ShaderData *)(sg->renderstate); diff --git a/intern/cycles/kernel/shaders/CMakeLists.txt b/intern/cycles/kernel/shaders/CMakeLists.txt index b43f8402d42..1a8ed4c884a 100644 --- a/intern/cycles/kernel/shaders/CMakeLists.txt +++ b/intern/cycles/kernel/shaders/CMakeLists.txt @@ -81,13 +81,15 @@ set(SRC_OSL node_wireframe.osl node_hair_bsdf.osl node_uv_map.osl + node_principled_bsdf.osl node_rgb_to_bw.osl ) set(SRC_OSL_HEADERS - node_texture.h node_color.h node_fresnel.h + node_ramp_util.h + node_texture.h stdosl.h oslutil.h ) diff --git a/intern/cycles/kernel/shaders/node_principled_bsdf.osl b/intern/cycles/kernel/shaders/node_principled_bsdf.osl new file mode 100644 index 00000000000..6870d479af3 --- /dev/null +++ b/intern/cycles/kernel/shaders/node_principled_bsdf.osl @@ -0,0 +1,120 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "stdosl.h" +#include "node_fresnel.h" + +shader node_principled_bsdf( + string distribution = "Multiscatter GGX", + color BaseColor = color(0.8, 0.8, 0.8), + float Subsurface = 0.0, + vector SubsurfaceRadius = vector(1.0, 1.0, 1.0), + color SubsurfaceColor = color(0.7, 0.1, 0.1), + float Metallic = 0.0, + float Specular = 0.5, + float SpecularTint = 0.0, + float Roughness = 0.5, + float Anisotropic = 0.0, + float AnisotropicRotation = 0.0, + float Sheen = 0.0, + float SheenTint = 0.5, + float Clearcoat = 0.0, + float ClearcoatRoughness = 0.03, + float IOR = 1.45, + float Transmission = 0.0, + float TransmissionRoughness = 0.0, + normal Normal = N, + normal ClearcoatNormal = N, + normal Tangent = normalize(dPdu), + output closure color BSDF = 0) +{ + float f = max(IOR, 1e-5); + float diffuse_weight = (1.0 - clamp(Metallic, 0.0, 1.0)) * (1.0 - clamp(Transmission, 0.0, 1.0)); + float final_transmission = clamp(Transmission, 0.0, 1.0) * (1.0 - clamp(Metallic, 0.0, 1.0)); + float specular_weight = (1.0 - final_transmission); + + vector T = Tangent; + + float m_cdlum = luminance(BaseColor); + color m_ctint = m_cdlum > 0.0 ? BaseColor / m_cdlum : color(0.0, 0.0, 0.0); // normalize lum. 
to isolate hue+sat + + /* rotate tangent */ + if (AnisotropicRotation != 0.0) + T = rotate(T, AnisotropicRotation * M_2PI, point(0.0, 0.0, 0.0), Normal); + + if (diffuse_weight > 1e-5) { + if (Subsurface > 1e-5) { + color mixed_ss_base_color = SubsurfaceColor * Subsurface + BaseColor * (1.0 - Subsurface); + BSDF = mixed_ss_base_color * bssrdf_principled(Normal, Subsurface * SubsurfaceRadius, 0.0, SubsurfaceColor, Roughness); + } else { + BSDF = BaseColor * principled_diffuse(Normal, Roughness); + } + + if (Sheen > 1e-5) { + color sheen_color = color(1.0, 1.0, 1.0) * (1.0 - SheenTint) + m_ctint * SheenTint; + + BSDF = BSDF + sheen_color * Sheen * principled_sheen(Normal); + } + + BSDF = BSDF * diffuse_weight; + } + + if (specular_weight > 1e-5) { + float aspect = sqrt(1.0 - Anisotropic * 0.9); + float r2 = Roughness * Roughness; + + float alpha_x = r2 / aspect; + float alpha_y = r2 * aspect; + + color tmp_col = color(1.0, 1.0, 1.0) * (1.0 - SpecularTint) + m_ctint * SpecularTint; + + color Cspec0 = (Specular * 0.08 * tmp_col) * (1.0 - Metallic) + BaseColor * Metallic; + + if (distribution == "GGX" || Roughness <= 0.075) { + BSDF = BSDF + specular_weight * microfacet_ggx_aniso_fresnel(Normal, T, alpha_x, alpha_y, (2.0 / (1.0 - sqrt(0.08 * Specular))) - 1.0, BaseColor, Cspec0); + } else { + BSDF = BSDF + specular_weight * microfacet_multi_ggx_aniso_fresnel(Normal, T, alpha_x, alpha_y, (2.0 / (1.0 - sqrt(0.08 * Specular))) - 1.0, BaseColor, Cspec0); + } + } + + if (final_transmission > 1e-5) { + color Cspec0 = BaseColor * SpecularTint + color(1.0, 1.0, 1.0) * (1.0 - SpecularTint); + float eta = backfacing() ? 
1.0 / f : f; + + if (distribution == "GGX" || Roughness <= 5e-2) { + float cosNO = dot(Normal, I); + float Fr = fresnel_dielectric_cos(cosNO, eta); + + float refl_roughness = Roughness; + if (Roughness <= 1e-2) + refl_roughness = 0.0; + + float transmission_roughness = refl_roughness; + if (distribution == "GGX") + transmission_roughness = 1.0 - (1.0 - refl_roughness) * (1.0 - TransmissionRoughness); + + BSDF = BSDF + final_transmission * (Fr * microfacet_ggx_fresnel(Normal, refl_roughness * refl_roughness, eta, BaseColor, Cspec0) + + (1.0 - Fr) * BaseColor * microfacet_ggx_refraction(Normal, transmission_roughness * transmission_roughness, eta)); + } else { + BSDF = BSDF + final_transmission * microfacet_multi_ggx_glass_fresnel(Normal, Roughness * Roughness, eta, BaseColor, Cspec0); + } + } + + if (Clearcoat > 1e-5) { + BSDF = BSDF + principled_clearcoat(ClearcoatNormal, Clearcoat, ClearcoatRoughness * ClearcoatRoughness); + } +} + diff --git a/intern/cycles/kernel/shaders/stdosl.h b/intern/cycles/kernel/shaders/stdosl.h index a8dda8a12c9..c91d2918687 100644 --- a/intern/cycles/kernel/shaders/stdosl.h +++ b/intern/cycles/kernel/shaders/stdosl.h @@ -530,6 +530,11 @@ closure color microfacet_ggx_refraction(normal N, float ag, float eta) BUILTIN; closure color microfacet_multi_ggx(normal N, float ag, color C) BUILTIN; closure color microfacet_multi_ggx_aniso(normal N, vector T, float ax, float ay, color C) BUILTIN; closure color microfacet_multi_ggx_glass(normal N, float ag, float eta, color C) BUILTIN; +closure color microfacet_ggx_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN; +closure color microfacet_ggx_aniso_fresnel(normal N, vector T, float ax, float ay, float eta, color C, color Cspec0) BUILTIN; +closure color microfacet_multi_ggx_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN; +closure color microfacet_multi_ggx_aniso_fresnel(normal N, vector T, float ax, float ay, float eta, color C, color Cspec0) BUILTIN; 
+closure color microfacet_multi_ggx_glass_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN; closure color microfacet_beckmann(normal N, float ab) BUILTIN; closure color microfacet_beckmann_aniso(normal N, vector T, float ax, float ay) BUILTIN; closure color microfacet_beckmann_refraction(normal N, float ab, float eta) BUILTIN; @@ -539,11 +544,15 @@ closure color emission() BUILTIN; closure color background() BUILTIN; closure color holdout() BUILTIN; closure color ambient_occlusion() BUILTIN; +closure color principled_diffuse(normal N, float roughness) BUILTIN; +closure color principled_sheen(normal N) BUILTIN; +closure color principled_clearcoat(normal N, float clearcoat, float clearcoat_roughness) BUILTIN; // BSSRDF closure color bssrdf_cubic(normal N, vector radius, float texture_blur, float sharpness) BUILTIN; closure color bssrdf_gaussian(normal N, vector radius, float texture_blur) BUILTIN; closure color bssrdf_burley(normal N, vector radius, float texture_blur, color albedo) BUILTIN; +closure color bssrdf_principled(normal N, vector radius, float texture_blur, color subsurface_color, float roughness) BUILTIN; // Hair closure color hair_reflection(normal N, float roughnessu, float roughnessv, vector T, float offset) BUILTIN; diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h new file mode 100644 index 00000000000..e2762a85fc8 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_branched.h @@ -0,0 +1,220 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +#ifdef __BRANCHED_PATH__ + +/* sets up the various state needed to do an indirect loop */ +ccl_device_inline void kernel_split_branched_path_indirect_loop_init(KernelGlobals *kg, int ray_index) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + /* save a copy of the state to restore later */ +#define BRANCHED_STORE(name) \ + branched_state->name = kernel_split_state.name[ray_index]; + + BRANCHED_STORE(path_state); + BRANCHED_STORE(throughput); + BRANCHED_STORE(ray); + BRANCHED_STORE(sd); + BRANCHED_STORE(isect); + BRANCHED_STORE(ray_state); + +#undef BRANCHED_STORE + + /* set loop counters to intial position */ + branched_state->next_closure = 0; + branched_state->next_sample = 0; +} + +/* ends an indirect loop and restores the previous state */ +ccl_device_inline void kernel_split_branched_path_indirect_loop_end(KernelGlobals *kg, int ray_index) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + /* restore state */ +#define BRANCHED_RESTORE(name) \ + kernel_split_state.name[ray_index] = branched_state->name; + + BRANCHED_RESTORE(path_state); + BRANCHED_RESTORE(throughput); + BRANCHED_RESTORE(ray); + BRANCHED_RESTORE(sd); + BRANCHED_RESTORE(isect); + BRANCHED_RESTORE(ray_state); + +#undef BRANCHED_RESTORE + + /* leave indirect loop */ + REMOVE_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT); +} + +ccl_device_inline bool kernel_split_branched_indirect_start_shared(KernelGlobals *kg, int ray_index) +{ + 
ccl_global char *ray_state = kernel_split_state.ray_state; + + int inactive_ray = dequeue_ray_index(QUEUE_INACTIVE_RAYS, + kernel_split_state.queue_data, kernel_split_params.queue_size, kernel_split_params.queue_index); + + if(!IS_STATE(ray_state, inactive_ray, RAY_INACTIVE)) { + return false; + } + +#define SPLIT_DATA_ENTRY(type, name, num) \ + kernel_split_state.name[inactive_ray] = kernel_split_state.name[ray_index]; + SPLIT_DATA_ENTRIES_BRANCHED_SHARED +#undef SPLIT_DATA_ENTRY + + kernel_split_state.branched_state[inactive_ray].shared_sample_count = 0; + kernel_split_state.branched_state[inactive_ray].original_ray = ray_index; + kernel_split_state.branched_state[inactive_ray].waiting_on_shared_samples = false; + + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + PathRadiance *inactive_L = &kernel_split_state.path_radiance[inactive_ray]; + + path_radiance_init(inactive_L, kernel_data.film.use_light_pass); + inactive_L->direct_throughput = L->direct_throughput; + path_radiance_copy_indirect(inactive_L, L); + + ray_state[inactive_ray] = RAY_REGENERATED; + ADD_RAY_FLAG(ray_state, inactive_ray, RAY_BRANCHED_INDIRECT_SHARED); + ADD_RAY_FLAG(ray_state, inactive_ray, IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)); + + atomic_fetch_and_inc_uint32((ccl_global uint*)&kernel_split_state.branched_state[ray_index].shared_sample_count); + + return true; +} + +/* bounce off surface and integrate indirect light */ +ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(KernelGlobals *kg, + int ray_index, + float num_samples_adjust, + ShaderData *saved_sd, + bool reset_path_state, + bool wait_for_shared) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + ShaderData *sd = saved_sd; + RNG rng = kernel_split_state.rng[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + float3 throughput = branched_state->throughput; + ccl_global PathState *ps = 
&kernel_split_state.path_state[ray_index]; + + float sum_sample_weight = 0.0f; +#ifdef __DENOISING_FEATURES__ + if(ps->denoising_feature_weight > 0.0f) { + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + /* transparency is not handled here, but in outer loop */ + if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { + continue; + } + + sum_sample_weight += sc->sample_weight; + } + } + else { + sum_sample_weight = 1.0f; + } +#endif /* __DENOISING_FEATURES__ */ + + for(int i = branched_state->next_closure; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + if(!CLOSURE_IS_BSDF(sc->type)) + continue; + /* transparency is not handled here, but in outer loop */ + if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) + continue; + + int num_samples; + + if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) + num_samples = kernel_data.integrator.diffuse_samples; + else if(CLOSURE_IS_BSDF_BSSRDF(sc->type)) + num_samples = 1; + else if(CLOSURE_IS_BSDF_GLOSSY(sc->type)) + num_samples = kernel_data.integrator.glossy_samples; + else + num_samples = kernel_data.integrator.transmission_samples; + + num_samples = ceil_to_int(num_samples_adjust*num_samples); + + float num_samples_inv = num_samples_adjust/num_samples; + RNG bsdf_rng = cmj_hash(rng, i); + + for(int j = branched_state->next_sample; j < num_samples; j++) { + if(reset_path_state) { + *ps = branched_state->path_state; + } + + ccl_global float3 *tp = &kernel_split_state.throughput[ray_index]; + *tp = throughput; + + ccl_global Ray *bsdf_ray = &kernel_split_state.ray[ray_index]; + + if(!kernel_branched_path_surface_bounce(kg, + &bsdf_rng, + sd, + sc, + j, + num_samples, + tp, + ps, + L, + bsdf_ray, + sum_sample_weight)) + { + continue; + } + + /* update state for next iteration */ + branched_state->next_closure = i; + branched_state->next_sample = j+1; + branched_state->num_samples = num_samples; + + /* start the indirect path */ + *tp *= num_samples_inv; + + 
if(kernel_split_branched_indirect_start_shared(kg, ray_index)) { + continue; + } + + return true; + } + + branched_state->next_sample = 0; + } + + branched_state->next_closure = sd->num_closure; + + if(wait_for_shared) { + branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0); + if(branched_state->waiting_on_shared_samples) { + return true; + } + } + + return false; +} + +#endif /* __BRANCHED_PATH__ */ + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h index 859c221d976..4c1fdd2d69c 100644 --- a/intern/cycles/kernel/split/kernel_buffer_update.h +++ b/intern/cycles/kernel/split/kernel_buffer_update.h @@ -111,24 +111,14 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg, buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride; if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { - float3 L_sum; -#ifdef __SHADOW_TRICKS__ - if(state->flag & PATH_RAY_SHADOW_CATCHER) { - L_sum = path_radiance_sum_shadowcatcher(kg, L, L_transparent); - } - else -#endif /* __SHADOW_TRICKS__ */ - { - L_sum = path_radiance_clamp_and_sum(kg, L); - } - kernel_write_light_passes(kg, buffer, L, sample); #ifdef __KERNEL_DEBUG__ kernel_write_debug_passes(kg, buffer, state, debug_data, sample); #endif - float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent)); /* accumulate result in output buffer */ - kernel_write_pass_float4(buffer, sample, L_rad); + bool is_shadow_catcher = (state->flag & PATH_RAY_SHADOW_CATCHER); + kernel_write_result(kg, buffer, sample, L, 1.0f - (*L_transparent), is_shadow_catcher); + path_rng_end(kg, rng_state, rng); ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h index 9d3d01fff75..e4545d66eff 100644 --- a/intern/cycles/kernel/split/kernel_data_init.h +++ 
b/intern/cycles/kernel/split/kernel_data_init.h @@ -67,6 +67,10 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)( unsigned int num_samples, ccl_global float *buffer) { +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, data_init); +#else + #ifdef __KERNEL_OPENCL__ kg->data = data; #endif @@ -105,21 +109,16 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)( /* Initialize queue data and queue index. */ if(thread_index < queuesize) { - /* Initialize active ray queue. */ - kernel_split_state.queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; - /* Initialize background and buffer update queue. */ - kernel_split_state.queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; - /* Initialize shadow ray cast of AO queue. */ - kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; - /* Initialize shadow ray cast of direct lighting queue. */ - kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + for(int i = 0; i < NUM_QUEUES; i++) { + kernel_split_state.queue_data[i * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + } } if(thread_index == 0) { - Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; - Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; - Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; - Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; + for(int i = 0; i < NUM_QUEUES; i++) { + Queue_index[i] = 0; + } + /* The scene-intersect kernel should not use the queues very first time. * since the queue would be empty. 
*/ @@ -148,6 +147,8 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)( *(rng_state + index) = hash_int_2d(x, y); } } + +#endif /* KERENL_STUB */ } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h index bdbf7387b95..3336c968a44 100644 --- a/intern/cycles/kernel/split/kernel_direct_lighting.h +++ b/intern/cycles/kernel/split/kernel_direct_lighting.h @@ -56,23 +56,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg, kernel_split_params.queue_size, 0); -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; ShaderData *sd = &kernel_split_state.sd[ray_index]; @@ -80,25 +63,24 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg, /* direct lighting */ #ifdef __EMISSION__ RNG rng = kernel_split_state.rng[ray_index]; + bool flag = (kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)); + +# ifdef __BRANCHED_PATH__ + if(flag && kernel_data.integrator.branched) { + flag = false; + enqueue_flag = 1; + } +# endif /* __BRANCHED_PATH__ */ + # ifdef __SHADOW_TRICKS__ if(flag && state->flag & PATH_RAY_SHADOW_CATCHER) { flag = false; - ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; - float3 throughput = kernel_split_state.throughput[ray_index]; - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - kernel_branched_path_surface_connect_light(kg, - &rng, - sd, - emission_sd, - 
state, - throughput, - 1.0f, - L, - 1); + enqueue_flag = 1; } # endif /* __SHADOW_TRICKS__ */ + if(flag) { /* Sample illumination from lights to find path contribution. */ float light_t = path_state_rng_1D(kg, &rng, state, PRNG_LIGHT); @@ -129,7 +111,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg, kernel_split_state.bsdf_eval[ray_index] = L_light; kernel_split_state.is_lamp[ray_index] = is_lamp; /* Mark ray state for next shadow kernel. */ - ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); enqueue_flag = 1; } } @@ -138,10 +119,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg, #endif /* __EMISSION__ */ } -#ifndef __COMPUTE_DEVICE_GPU__ - } -#endif - #ifdef __EMISSION__ /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */ enqueue_ray_index_local(ray_index, @@ -152,6 +129,27 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg, kernel_split_state.queue_data, kernel_split_params.queue_index); #endif + +#ifdef __BRANCHED_PATH__ + /* Enqueue RAY_LIGHT_INDIRECT_NEXT_ITER rays + * this is the last kernel before next_iteration_setup that uses local atomics so we do this here + */ + ccl_barrier(CCL_LOCAL_MEM_FENCE); + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + enqueue_ray_index_local(ray_index, + QUEUE_LIGHT_INDIRECT_ITER, + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER), + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + +#endif /* __BRANCHED_PATH__ */ } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h index 47d3c280831..9f8dd2392d9 100644 --- a/intern/cycles/kernel/split/kernel_do_volume.h +++ b/intern/cycles/kernel/split/kernel_do_volume.h @@ -16,6 +16,100 @@ CCL_NAMESPACE_BEGIN +#if 
defined(__BRANCHED_PATH__) && defined(__VOLUME__) + +ccl_device_inline void kernel_split_branched_path_volume_indirect_light_init(KernelGlobals *kg, int ray_index) +{ + kernel_split_branched_path_indirect_loop_init(kg, ray_index); + + ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT); +} + +ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(KernelGlobals *kg, int ray_index) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + ShaderData *sd = &kernel_split_state.sd[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + + /* GPU: no decoupled ray marching, scatter probalistically */ + int num_samples = kernel_data.integrator.volume_samples; + float num_samples_inv = 1.0f/num_samples; + + Ray volume_ray = branched_state->ray; + volume_ray.t = (!IS_STATE(&branched_state->ray_state, 0, RAY_HIT_BACKGROUND)) ? 
branched_state->isect.t : FLT_MAX; + + bool heterogeneous = volume_stack_is_heterogeneous(kg, branched_state->path_state.volume_stack); + + for(int j = branched_state->next_sample; j < num_samples; j++) { + ccl_global PathState *ps = &kernel_split_state.path_state[ray_index]; + *ps = branched_state->path_state; + + ccl_global Ray *pray = &kernel_split_state.ray[ray_index]; + *pray = branched_state->ray; + + ccl_global float3 *tp = &kernel_split_state.throughput[ray_index]; + *tp = branched_state->throughput * num_samples_inv; + + /* branch RNG state */ + path_state_branch(ps, j, num_samples); + + /* integrate along volume segment with distance sampling */ + VolumeIntegrateResult result = kernel_volume_integrate( + kg, ps, sd, &volume_ray, L, tp, &rng, heterogeneous); + +# ifdef __VOLUME_SCATTER__ + if(result == VOLUME_PATH_SCATTERED) { + /* direct lighting */ + kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *tp, &branched_state->path_state, L); + + /* indirect light bounce */ + if(!kernel_path_volume_bounce(kg, &rng, sd, tp, ps, L, pray)) { + continue; + } + + /* start the indirect path */ + branched_state->next_closure = 0; + branched_state->next_sample = j+1; + branched_state->num_samples = num_samples; + + /* Attempting to share too many samples is slow for volumes as it causes us to + * loop here more and have many calls to kernel_volume_integrate which evaluates + * shaders. The many expensive shader evaluations cause the work load to become + * unbalanced and many threads to become idle in this kernel. Limiting the + * number of shared samples here helps quite a lot. 
+ */ + if(branched_state->shared_sample_count < 2) { + if(kernel_split_branched_indirect_start_shared(kg, ray_index)) { + continue; + } + } + + return true; + } +# endif + } + + branched_state->next_sample = num_samples; + + branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0); + if(branched_state->waiting_on_shared_samples) { + return true; + } + + kernel_split_branched_path_indirect_loop_end(kg, ray_index); + + /* todo: avoid this calculation using decoupled ray marching */ + float3 throughput = kernel_split_state.throughput[ray_index]; + kernel_volume_shadow(kg, emission_sd, &kernel_split_state.path_state[ray_index], &volume_ray, &throughput); + kernel_split_state.throughput[ray_index] = throughput; + + return false; +} + +#endif /* __BRANCHED_PATH__ && __VOLUME__ */ ccl_device void kernel_do_volume(KernelGlobals *kg) { @@ -23,37 +117,36 @@ ccl_device void kernel_do_volume(KernelGlobals *kg) /* We will empty this queue in this kernel. */ if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; +# ifdef __BRANCHED_PATH__ + kernel_split_params.queue_index[QUEUE_VOLUME_INDIRECT_ITER] = 0; +# endif /* __BRANCHED_PATH__ */ } - /* Fetch use_queues_flag. */ - char local_use_queues_flag = *kernel_split_params.use_queues_flag; - ccl_barrier(CCL_LOCAL_MEM_FENCE); int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); - if(local_use_queues_flag) { + + if(*kernel_split_params.use_queues_flag) { ray_index = get_ray_index(kg, ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, kernel_split_state.queue_data, kernel_split_params.queue_size, 1); - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } } - if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) || - IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND)) { + ccl_global char *ray_state = kernel_split_state.ray_state; - bool hit = ! 
IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND); - - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) || + IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; RNG rng = kernel_split_state.rng[ray_index]; ccl_global Intersection *isect = &kernel_split_state.isect[ray_index]; ShaderData *sd = &kernel_split_state.sd[ray_index]; - ShaderData *sd_input = &kernel_split_state.sd_DL_shadow[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + + bool hit = ! IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND); /* Sanitize volume stack. */ if(!hit) { @@ -64,31 +157,68 @@ ccl_device void kernel_do_volume(KernelGlobals *kg) Ray volume_ray = *ray; volume_ray.t = (hit)? 
isect->t: FLT_MAX; - bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack); +# ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { +# endif /* __BRANCHED_PATH__ */ + bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack); - { - /* integrate along volume segment with distance sampling */ - VolumeIntegrateResult result = kernel_volume_integrate( - kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous); + { + /* integrate along volume segment with distance sampling */ + VolumeIntegrateResult result = kernel_volume_integrate( + kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous); # ifdef __VOLUME_SCATTER__ - if(result == VOLUME_PATH_SCATTERED) { - /* direct lighting */ - kernel_path_volume_connect_light(kg, &rng, sd, sd_input, *throughput, state, L); - - /* indirect light bounce */ - if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray)) - ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED); - else - ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER); + if(result == VOLUME_PATH_SCATTERED) { + /* direct lighting */ + kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *throughput, state, L); + + /* indirect light bounce */ + if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + else { + kernel_split_path_end(kg, ray_index); + } + } +# endif /* __VOLUME_SCATTER__ */ } -# endif + +# ifdef __BRANCHED_PATH__ } + else { + kernel_split_branched_path_volume_indirect_light_init(kg, ray_index); + + if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + } +# endif /* __BRANCHED_PATH__ */ } + kernel_split_state.rng[ray_index] = rng; } -#endif +# ifdef __BRANCHED_PATH__ + /* iter loop */ + ray_index = 
get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0), + QUEUE_VOLUME_INDIRECT_ITER, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + + if(IS_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER)) { + /* for render passes, sum and reset indirect light pass variables + * for the next samples */ + path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]); + path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]); + + if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + } +# endif /* __BRANCHED_PATH__ */ + +#endif /* __VOLUME__ */ } diff --git a/intern/cycles/kernel/split/kernel_enqueue_inactive.h b/intern/cycles/kernel/split/kernel_enqueue_inactive.h new file mode 100644 index 00000000000..496355bbc3a --- /dev/null +++ b/intern/cycles/kernel/split/kernel_enqueue_inactive.h @@ -0,0 +1,46 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_enqueue_inactive(KernelGlobals *kg, + ccl_local_param unsigned int *local_queue_atomics) +{ +#ifdef __BRANCHED_PATH__ + /* Enqeueue RAY_INACTIVE rays into QUEUE_INACTIVE_RAYS queue. 
*/ + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + + char enqueue_flag = 0; + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_INACTIVE)) { + enqueue_flag = 1; + } + + enqueue_ray_index_local(ray_index, + QUEUE_INACTIVE_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); +#endif /* __BRANCHED_PATH__ */ +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h index 9fc853a84bf..fec671be016 100644 --- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h +++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h @@ -52,6 +52,7 @@ CCL_NAMESPACE_BEGIN * - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with * flag RAY_SHADOW_RAY_CAST_AO */ + ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( KernelGlobals *kg, ccl_local_param BackgroundAOLocals *locals) @@ -62,8 +63,9 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( } ccl_barrier(CCL_LOCAL_MEM_FENCE); +#ifdef __AO__ char enqueue_flag = 0; - char enqueue_flag_AO_SHADOW_RAY_CAST = 0; +#endif int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); ray_index = get_ray_index(kg, ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, @@ -122,14 +124,22 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( #ifdef __SHADOW_TRICKS__ if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) { - if (state->flag & PATH_RAY_CAMERA) { - state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY); + if(state->flag & PATH_RAY_CAMERA) { + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + 
state->flag |= (PATH_RAY_SHADOW_CATCHER | + PATH_RAY_SHADOW_CATCHER_ONLY | + PATH_RAY_STORE_SHADOW_INFO); state->catcher_object = sd->object; if(!kernel_data.background.transparent) { - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - L->shadow_color = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray); + L->shadow_background_color = indirect_background( + kg, + &kernel_split_state.sd_DL_shadow[ray_index], + state, + ray); } + L->shadow_radiance_sum = path_radiance_clamp_and_sum(kg, L); + L->shadow_throughput = average(throughput); } } else { @@ -155,8 +165,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( kernel_split_state.L_transparent[ray_index] += average(holdout_weight*throughput); } if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - enqueue_flag = 1; + kernel_split_path_end(kg, ray_index); } } #endif /* __HOLDOUT__ */ @@ -164,18 +173,31 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - /* Holdout mask objects do not write data passes. */ - kernel_write_data_passes(kg, - buffer, - L, - sd, - sample, - state, - throughput); + +#ifdef __BRANCHED_PATH__ + if(!IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) +#endif /* __BRANCHED_PATH__ */ + { + /* Holdout mask objects do not write data passes. */ + kernel_write_data_passes(kg, + buffer, + L, + sd, + sample, + state, + throughput); + } + /* Blurring of bsdf after bounces, for rays that have a small likelihood * of following this particular path (diffuse, rough glossy. 
*/ - if(kernel_data.integrator.filter_glossy != FLT_MAX) { +#ifndef __BRANCHED_PATH__ + if(kernel_data.integrator.filter_glossy != FLT_MAX) +#else + if(kernel_data.integrator.filter_glossy != FLT_MAX && + (!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT))) +#endif /* __BRANCHED_PATH__ */ + { float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf; if(blur_pdf < 1.0f) { float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f; @@ -201,85 +223,62 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( * mainly due to the mixed in MIS that we use. gives too many unneeded * shader evaluations, only need emission if we are going to terminate. */ +#ifndef __BRANCHED_PATH__ float probability = path_state_terminate_probability(kg, state, throughput); +#else + float probability = 1.0f; + + if(!kernel_data.integrator.branched) { + probability = path_state_terminate_probability(kg, state, throughput); + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { + int num_samples = kernel_split_state.branched_state[ray_index].num_samples; + probability = path_state_terminate_probability(kg, state, throughput*num_samples); + } + else if(state->flag & PATH_RAY_TRANSPARENT) { + probability = path_state_terminate_probability(kg, state, throughput); + } +#endif if(probability == 0.0f) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - enqueue_flag = 1; + kernel_split_path_end(kg, ray_index); } if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { if(probability != 1.0f) { float terminate = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_TERMINATE); if(terminate >= probability) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - enqueue_flag = 1; + kernel_split_path_end(kg, ray_index); } else { kernel_split_state.throughput[ray_index] = throughput/probability; } } + + kernel_update_denoising_features(kg, sd, state, L); } } #ifdef __AO__ if(IS_STATE(ray_state, ray_index, 
RAY_ACTIVE)) { /* ambient occlusion */ - if(kernel_data.integrator.use_ambient_occlusion || - (sd->flag & SD_AO)) - { - /* todo: solve correlation */ - float bsdf_u, bsdf_v; - path_state_rng_2D(kg, &rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); - - float ao_factor = kernel_data.background.ao_factor; - float3 ao_N; - kernel_split_state.ao_bsdf[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); - kernel_split_state.ao_alpha[ray_index] = shader_bsdf_alpha(kg, sd); - - float3 ao_D; - float ao_pdf; - sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - - if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { - Ray _ray; - _ray.P = ray_offset(sd->P, sd->Ng); - _ray.D = ao_D; - _ray.t = kernel_data.background.ao_distance; -#ifdef __OBJECT_MOTION__ - _ray.time = sd->time; -#endif - _ray.dP = sd->dP; - _ray.dD = differential3_zero(); - kernel_split_state.ao_light_ray[ray_index] = _ray; - - ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); - enqueue_flag_AO_SHADOW_RAY_CAST = 1; - } + if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) { + enqueue_flag = 1; } } #endif /* __AO__ */ - kernel_split_state.rng[ray_index] = rng; + kernel_split_state.rng[ray_index] = rng; #ifndef __COMPUTE_DEVICE_GPU__ } #endif - /* Enqueue RAY_UPDATE_BUFFER rays. */ - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - kernel_split_params.queue_size, - &locals->queue_atomics_bg, - kernel_split_state.queue_data, - kernel_split_params.queue_index); - #ifdef __AO__ /* Enqueue to-shadow-ray-cast rays. 
*/ enqueue_ray_index_local(ray_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, - enqueue_flag_AO_SHADOW_RAY_CAST, + enqueue_flag, kernel_split_params.queue_size, &locals->queue_atomics_ao, kernel_split_state.queue_data, diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h index 8192528622e..f0ebb90f60a 100644 --- a/intern/cycles/kernel/split/kernel_indirect_background.h +++ b/intern/cycles/kernel/split/kernel_indirect_background.h @@ -23,7 +23,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg) int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); int ray_index; - if(kernel_data.integrator.ao_bounces) { + if(kernel_data.integrator.ao_bounces != INT_MAX) { ray_index = get_ray_index(kg, thread_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, kernel_split_state.queue_data, @@ -34,7 +34,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg) if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; if(state->bounce > kernel_data.integrator.ao_bounces) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + kernel_split_path_end(kg, ray_index); } } } @@ -63,7 +63,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg) #ifdef __PASSES__ if(!(kernel_data.film.pass_flag & PASS_BACKGROUND)) #endif - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + kernel_split_path_end(kg, ray_index); } if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { @@ -72,7 +72,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg) float3 L_background = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray); path_radiance_accum_background(L, state, (*throughput), L_background); #endif - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + kernel_split_path_end(kg, ray_index); } } diff --git a/intern/cycles/kernel/split/kernel_indirect_subsurface.h 
b/intern/cycles/kernel/split/kernel_indirect_subsurface.h index a56e85abeb9..82bc2f01fd7 100644 --- a/intern/cycles/kernel/split/kernel_indirect_subsurface.h +++ b/intern/cycles/kernel/split/kernel_indirect_subsurface.h @@ -49,26 +49,29 @@ ccl_device void kernel_indirect_subsurface(KernelGlobals *kg) ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { - ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; - kernel_path_subsurface_accum_indirect(ss_indirect, L); +#ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched) { +#endif + if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { + ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; + kernel_path_subsurface_accum_indirect(ss_indirect, L); - /* Trace indirect subsurface rays by restarting the loop. this uses less - * stack memory than invoking kernel_path_indirect. - */ - if(ss_indirect->num_rays) { - kernel_path_subsurface_setup_indirect(kg, - ss_indirect, - state, - ray, - L, - throughput); - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); - } - else { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + /* Trace indirect subsurface rays by restarting the loop. this uses less + * stack memory than invoking kernel_path_indirect. 
+ */ + if(ss_indirect->num_rays) { + kernel_path_subsurface_setup_indirect(kg, + ss_indirect, + state, + ray, + L, + throughput); + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } } +#ifdef __BRANCHED_PATH__ } +#endif #endif /* __SUBSURFACE__ */ diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h index 1bebc16e25b..7758e35fd32 100644 --- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h +++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h @@ -44,6 +44,52 @@ CCL_NAMESPACE_BEGIN * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with * RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays. */ + +#ifdef __BRANCHED_PATH__ +ccl_device_inline void kernel_split_branched_indirect_light_init(KernelGlobals *kg, int ray_index) +{ + kernel_split_branched_path_indirect_loop_init(kg, ray_index); + + ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT); +} + +ccl_device void kernel_split_branched_indirect_light_end(KernelGlobals *kg, int ray_index) +{ + kernel_split_branched_path_indirect_loop_end(kg, ray_index); + + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + + /* continue in case of transparency */ + *throughput *= shader_bsdf_transparency(kg, sd); + + if(is_zero(*throughput)) { + kernel_split_path_end(kg, ray_index); + } + else { + /* Update Path State */ + state->flag |= PATH_RAY_TRANSPARENT; + state->transparent_bounce++; + + ray->P = ray_offset(sd->P, -sd->Ng); + ray->t -= sd->ray_length; /* clipping works through transparent */ + +# ifdef __RAY_DIFFERENTIALS__ + ray->dP = sd->dP; + ray->dD.dx = -sd->dI.dx; + ray->dD.dy = -sd->dI.dy; +# endif /* __RAY_DIFFERENTIALS__ */ + +# ifdef __VOLUME__ + /* enter/exit volume */ + 
kernel_volume_stack_enter_exit(kg, sd, state->volume_stack); +# endif /* __VOLUME__ */ + } +} +#endif /* __BRANCHED_PATH__ */ + ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, ccl_local_param unsigned int *local_queue_atomics) { @@ -67,7 +113,6 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; } - char enqueue_flag = 0; int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); ray_index = get_ray_index(kg, ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, @@ -75,102 +120,127 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, kernel_split_params.queue_size, 0); -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - - /* Load ShaderData structure. */ - PathRadiance *L = NULL; - ccl_global PathState *state = NULL; ccl_global char *ray_state = kernel_split_state.ray_state; - /* Path radiance update for AO/Direct_lighting's shadow blocked. 
*/ - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || - IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) - { - state = &kernel_split_state.path_state[ray_index]; - L = &kernel_split_state.path_radiance[ray_index]; - float3 _throughput = kernel_split_state.throughput[ray_index]; - - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { - float3 shadow = kernel_split_state.ao_light_ray[ray_index].P; - // TODO(mai): investigate correctness here - char update_path_radiance = (char)kernel_split_state.ao_light_ray[ray_index].t; - if(update_path_radiance) { - path_radiance_accum_ao(L, - _throughput, - kernel_split_state.ao_alpha[ray_index], - kernel_split_state.ao_bsdf[ray_index], - shadow, - state->bounce); - } - else { - path_radiance_accum_total_ao(L, _throughput, kernel_split_state.ao_bsdf[ray_index]); + bool active = IS_STATE(ray_state, ray_index, RAY_ACTIVE); + if(active) { + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + +#ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { +#endif + /* Compute direct lighting and next bounce. 
*/ + if(!kernel_path_surface_bounce(kg, &rng, sd, throughput, state, L, ray)) { + kernel_split_path_end(kg, ray_index); } - REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); +#ifdef __BRANCHED_PATH__ } - - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) { - float3 shadow = kernel_split_state.light_ray[ray_index].P; - // TODO(mai): investigate correctness here - char update_path_radiance = (char)kernel_split_state.light_ray[ray_index].t; - BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index]; - if(update_path_radiance) { - path_radiance_accum_light(L, - _throughput, - &L_light, - shadow, - 1.0f, - state->bounce, - kernel_split_state.is_lamp[ray_index]); + else { + kernel_split_branched_indirect_light_init(kg, ray_index); + + if(kernel_split_branched_path_surface_indirect_light_iter(kg, + ray_index, + 1.0f, + &kernel_split_state.branched_state[ray_index].sd, + true, + true)) + { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); } else { - path_radiance_accum_total_light(L, _throughput, &L_light); + kernel_split_branched_indirect_light_end(kg, ray_index); } - REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); } +#endif /* __BRANCHED_PATH__ */ + + kernel_split_state.rng[ray_index] = rng; } - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - RNG rng = kernel_split_state.rng[ray_index]; - state = &kernel_split_state.path_state[ray_index]; - L = &kernel_split_state.path_radiance[ray_index]; + /* Enqueue RAY_UPDATE_BUFFER rays. 
*/ + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER) && active, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + +#ifdef __BRANCHED_PATH__ + /* iter loop */ + if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { + kernel_split_params.queue_index[QUEUE_LIGHT_INDIRECT_ITER] = 0; + } - /* Compute direct lighting and next bounce. */ - if(!kernel_path_surface_bounce(kg, &rng, &kernel_split_state.sd[ray_index], throughput, state, L, ray)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - enqueue_flag = 1; + ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0), + QUEUE_LIGHT_INDIRECT_ITER, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + + if(IS_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER)) { + /* for render passes, sum and reset indirect light pass variables + * for the next samples */ + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + + path_radiance_sum_indirect(L); + path_radiance_reset_indirect(L); + + if(kernel_split_branched_path_surface_indirect_light_iter(kg, + ray_index, + 1.0f, + &kernel_split_state.branched_state[ray_index].sd, + true, + true)) + { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + else { + kernel_split_branched_indirect_light_end(kg, ray_index); } - kernel_split_state.rng[ray_index] = rng; } -#ifndef __COMPUTE_DEVICE_GPU__ +# ifdef __VOLUME__ + /* Enqueue RAY_VOLUME_INDIRECT_NEXT_ITER rays */ + ccl_barrier(CCL_LOCAL_MEM_FENCE); + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; } -#endif + ccl_barrier(CCL_LOCAL_MEM_FENCE); - /* Enqueue RAY_UPDATE_BUFFER rays. 
*/ + ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, + QUEUE_VOLUME_INDIRECT_ITER, + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER), + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + +# endif /* __VOLUME__ */ + +# ifdef __SUBSURFACE__ + /* Enqueue RAY_SUBSURFACE_INDIRECT_NEXT_ITER rays */ + ccl_barrier(CCL_LOCAL_MEM_FENCE); + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + enqueue_ray_index_local(ray_index, + QUEUE_SUBSURFACE_INDIRECT_ITER, + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER), kernel_split_params.queue_size, local_queue_atomics, kernel_split_state.queue_data, kernel_split_params.queue_index); +# endif /* __SUBSURFACE__ */ +#endif /* __BRANCHED_PATH__ */ } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h index e2e841f36d3..66ce2dfb6f1 100644 --- a/intern/cycles/kernel/split/kernel_queue_enqueue.h +++ b/intern/cycles/kernel/split/kernel_queue_enqueue.h @@ -51,7 +51,8 @@ ccl_device void kernel_queue_enqueue(KernelGlobals *kg, int queue_number = -1; if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND) || - IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER)) { + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER) || + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) { queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; } else if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) || diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h 
b/intern/cycles/kernel/split/kernel_scene_intersect.h index 5dc94caec85..45984ca509b 100644 --- a/intern/cycles/kernel/split/kernel_scene_intersect.h +++ b/intern/cycles/kernel/split/kernel_scene_intersect.h @@ -43,11 +43,21 @@ ccl_device void kernel_scene_intersect(KernelGlobals *kg) } /* All regenerated rays become active here */ - if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) - ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE); + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) { +#ifdef __BRANCHED_PATH__ + if(kernel_split_state.branched_state[ray_index].waiting_on_shared_samples) { + kernel_split_path_end(kg, ray_index); + } + else +#endif /* __BRANCHED_PATH__ */ + { + ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE); + } + } - if(!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) + if(!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { return; + } #ifdef __KERNEL_DEBUG__ DebugData *debug_data = &kernel_split_state.debug_data[ray_index]; diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h index 0f1696e34a0..2801b32f285 100644 --- a/intern/cycles/kernel/split/kernel_shader_eval.h +++ b/intern/cycles/kernel/split/kernel_shader_eval.h @@ -1,5 +1,5 @@ /* - * Copyright 2011-2015 Blender Foundation + * Copyright 2011-2017 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,54 +16,61 @@ CCL_NAMESPACE_BEGIN -/* This kernel sets up the ShaderData structure from the values computed +/* This kernel evaluates ShaderData structure from the values computed * by the previous kernels. - * - * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them - * in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. 
*/ -ccl_device void kernel_shader_eval(KernelGlobals *kg, - ccl_local_param unsigned int *local_queue_atomics) +ccl_device void kernel_shader_eval(KernelGlobals *kg) { - /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */ - if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { - *local_queue_atomics = 0; - } - ccl_barrier(CCL_LOCAL_MEM_FENCE); int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + /* Sorting on cuda split is not implemented */ +#ifdef __KERNEL_CUDA__ + int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS]; +#else + int queue_index = kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS]; +#endif + if(ray_index >= queue_index) { + return; + } ray_index = get_ray_index(kg, ray_index, +#ifdef __KERNEL_CUDA__ QUEUE_ACTIVE_AND_REGENERATED_RAYS, +#else + QUEUE_SHADER_SORTED_RAYS, +#endif kernel_split_state.queue_data, kernel_split_params.queue_size, 0); - char enqueue_flag = 0; - if((ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) { - enqueue_flag = 1; + if(ray_index == QUEUE_EMPTY_SLOT) { + return; } - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - kernel_split_params.queue_size, - local_queue_atomics, - kernel_split_state.queue_data, - kernel_split_params.queue_index); - - /* Continue on with shader evaluation. 
*/ - if((ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { - Intersection isect = kernel_split_state.isect[ray_index]; + ccl_global char *ray_state = kernel_split_state.ray_state; + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { RNG rng = kernel_split_state.rng[ray_index]; ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - Ray ray = kernel_split_state.ray[ray_index]; - shader_setup_from_ray(kg, - &kernel_split_state.sd[ray_index], - &isect, - &ray); +#ifndef __BRANCHED_PATH__ float rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF); shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN); +#else + ShaderContext ctx = SHADER_CONTEXT_MAIN; + float rbsdf = 0.0f; + + if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { + rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF); + + } + + if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { + ctx = SHADER_CONTEXT_INDIRECT; + } + + shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, ctx); + shader_merge_closures(&kernel_split_state.sd[ray_index]); +#endif /* __BRANCHED_PATH__ */ + kernel_split_state.rng[ray_index] = rng; } } diff --git a/intern/cycles/kernel/split/kernel_shader_setup.h b/intern/cycles/kernel/split/kernel_shader_setup.h new file mode 100644 index 00000000000..0432689d9fa --- /dev/null +++ b/intern/cycles/kernel/split/kernel_shader_setup.h @@ -0,0 +1,70 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* This kernel sets up the ShaderData structure from the values computed + * by the previous kernels. + * + * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them + * in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. + */ +ccl_device void kernel_shader_setup(KernelGlobals *kg, + ccl_local_param unsigned int *local_queue_atomics) +{ + /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */ + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS]; + if(ray_index >= queue_index) { + return; + } + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + + char enqueue_flag = (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0; + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + + /* Continue on with shader evaluation. 
*/ + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { + Intersection isect = kernel_split_state.isect[ray_index]; + Ray ray = kernel_split_state.ray[ray_index]; + + shader_setup_from_ray(kg, + &kernel_split_state.sd[ray_index], + &isect, + &ray); + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h new file mode 100644 index 00000000000..297decb0bc2 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_shader_sort.h @@ -0,0 +1,97 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +CCL_NAMESPACE_BEGIN + + +ccl_device void kernel_shader_sort(KernelGlobals *kg, + ccl_local_param ShaderSortLocals *locals) +{ +#ifndef __KERNEL_CUDA__ + int tid = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + uint qsize = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS]; + if(tid == 0) { + kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS] = qsize; + } + + uint offset = (tid/SHADER_SORT_LOCAL_SIZE)*SHADER_SORT_BLOCK_SIZE; + if(offset >= qsize) { + return; + } + + int lid = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0); + uint input = QUEUE_ACTIVE_AND_REGENERATED_RAYS * (kernel_split_params.queue_size); + uint output = QUEUE_SHADER_SORTED_RAYS * (kernel_split_params.queue_size); + ccl_local uint *local_value = &locals->local_value[0]; + ccl_local ushort *local_index = &locals->local_index[0]; + + /* copy to local memory */ + for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { + uint idx = offset + i + lid; + uint add = input + idx; + uint value = (~0); + if(idx < qsize) { + int ray_index = kernel_split_state.queue_data[add]; + bool valid = (ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE); + if(valid) { + value = kernel_split_state.sd[ray_index].shader & SHADER_MASK; + } + } + local_value[i + lid] = value; + local_index[i + lid] = i + lid; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + /* skip sorting for cpu split kernel */ +# ifdef __KERNEL_OPENCL__ + + /* bitonic sort */ + for (uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) { + for (uint inc = length; inc > 0; inc >>= 1) { + for (uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) { + uint i = lid + ii; + bool direction = ((i & (length << 1)) != 0); + uint j = i ^ inc; + ushort ioff = local_index[i]; + ushort joff = local_index[j]; + uint iKey = local_value[ioff]; + uint jKey = local_value[joff]; + bool smaller = (jKey < iKey) || (jKey == iKey && j < i); + 
bool swap = smaller ^ (j < i) ^ direction; + ccl_barrier(CCL_LOCAL_MEM_FENCE); + local_index[i] = (swap) ? joff : ioff; + local_index[j] = (swap) ? ioff : joff; + ccl_barrier(CCL_LOCAL_MEM_FENCE); + } + } + } +# endif /* __KERNEL_OPENCL__ */ + + /* copy to destination */ + for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { + uint idx = offset + i + lid; + uint lidx = local_index[i + lid]; + uint outi = output + idx; + uint ini = input + offset + lidx; + uint value = local_value[lidx]; + if(idx < qsize) { + kernel_split_state.queue_data[outi] = (value == (~0)) ? QUEUE_EMPTY_SLOT : kernel_split_state.queue_data[ini]; + } + } +#endif /* __KERNEL_CUDA__ */ +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h index 4243e18de72..474286285a9 100644 --- a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h +++ b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h @@ -29,31 +29,29 @@ ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg) kernel_split_state.queue_data, kernel_split_params.queue_size, 1); } - if(ray_index == QUEUE_EMPTY_SLOT) + if(ray_index == QUEUE_EMPTY_SLOT) { return; + } - /* Flag determining if we need to update L. */ - char update_path_radiance = 0; - - if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - ccl_global Ray *light_ray_global = &kernel_split_state.ao_light_ray[ray_index]; - - float3 shadow; - Ray ray = *light_ray_global; - update_path_radiance = !(shadow_blocked(kg, - &kernel_split_state.sd_DL_shadow[ray_index], - state, - &ray, - &shadow)); - - *light_ray_global = ray; - /* We use light_ray_global's P and t to store shadow and - * update_path_radiance. 
- */ - light_ray_global->P = shadow; - light_ray_global->t = update_path_radiance; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; + float3 throughput = kernel_split_state.throughput[ray_index]; + +#ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched || IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { +#endif + kernel_path_ao(kg, sd, emission_sd, L, state, &rng, throughput, shader_bsdf_alpha(kg, sd)); +#ifdef __BRANCHED_PATH__ + } + else { + kernel_branched_path_ao(kg, sd, emission_sd, L, state, &rng, throughput); } +#endif + + kernel_split_state.rng[ray_index] = rng; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h index bb8f0157965..78e61709b01 100644 --- a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h +++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h @@ -29,31 +29,82 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg) kernel_split_state.queue_data, kernel_split_params.queue_size, 1); } +#ifdef __BRANCHED_PATH__ + /* TODO(mai): move this somewhere else? */ + if(thread_index == 0) { + /* Clear QUEUE_INACTIVE_RAYS before next kernel. */ + kernel_split_params.queue_index[QUEUE_INACTIVE_RAYS] = 0; + } +#endif /* __BRANCHED_PATH__ */ + if(ray_index == QUEUE_EMPTY_SLOT) return; - /* Flag determining if we need to update L. 
*/ - char update_path_radiance = 0; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + Ray ray = kernel_split_state.light_ray[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + float3 throughput = kernel_split_state.throughput[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; + + BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + bool is_lamp = kernel_split_state.is_lamp[ray_index]; + +# if defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__) + bool use_branched = false; + int all = 0; + + if(state->flag & PATH_RAY_SHADOW_CATCHER) { + use_branched = true; + all = 1; + } +# if defined(__BRANCHED_PATH__) + else if(kernel_data.integrator.branched) { + use_branched = true; - if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) { - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - ccl_global Ray *light_ray_global = &kernel_split_state.light_ray[ray_index]; + if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { + all = (kernel_data.integrator.sample_all_lights_indirect); + } + else + { + all = (kernel_data.integrator.sample_all_lights_direct); + } + } +# endif /* __BRANCHED_PATH__ */ + if(use_branched) { + kernel_branched_path_surface_connect_light(kg, + &rng, + sd, + emission_sd, + state, + throughput, + 1.0f, + L, + all); + } + else +# endif /* defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)*/ + { + /* trace shadow ray */ float3 shadow; - Ray ray = *light_ray_global; - update_path_radiance = !(shadow_blocked(kg, - &kernel_split_state.sd_DL_shadow[ray_index], - state, - &ray, - &shadow)); - - *light_ray_global = ray; - /* We use light_ray_global's P and t to store shadow and - * update_path_radiance. 
- */ - light_ray_global->P = shadow; - light_ray_global->t = update_path_radiance; + + if(!shadow_blocked(kg, + emission_sd, + state, + &ray, + &shadow)) + { + /* accumulate */ + path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp); + } + else { + path_radiance_accum_total_light(L, state, throughput, &L_light); + } } + + kernel_split_state.rng[ray_index] = rng; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h index 4303ba0a905..08f0124b529 100644 --- a/intern/cycles/kernel/split/kernel_split_common.h +++ b/intern/cycles/kernel/split/kernel_split_common.h @@ -37,41 +37,55 @@ #include "util/util_atomic.h" -#include "kernel/kernel_random.h" -#include "kernel/kernel_projection.h" -#include "kernel/kernel_montecarlo.h" -#include "kernel/kernel_differential.h" -#include "kernel/kernel_camera.h" - -#include "kernel/geom/geom.h" -#include "kernel/bvh/bvh.h" - -#include "kernel/kernel_accumulate.h" -#include "kernel/kernel_shader.h" -#include "kernel/kernel_light.h" -#include "kernel/kernel_passes.h" - -#ifdef __SUBSURFACE__ -# include "kernel/kernel_subsurface.h" +#include "kernel/kernel_path.h" +#ifdef __BRANCHED_PATH__ +# include "kernel/kernel_path_branched.h" #endif -#ifdef __VOLUME__ -# include "kernel/kernel_volume.h" -#endif +#include "kernel/kernel_queues.h" +#include "kernel/kernel_work_stealing.h" -#include "kernel/kernel_path_state.h" -#include "kernel/kernel_shadow.h" -#include "kernel/kernel_emission.h" -#include "kernel/kernel_path_common.h" -#include "kernel/kernel_path_surface.h" -#include "kernel/kernel_path_volume.h" -#include "kernel/kernel_path_subsurface.h" +#ifdef __BRANCHED_PATH__ +# include "kernel/split/kernel_branched.h" +#endif -#ifdef __KERNEL_DEBUG__ -# include "kernel/kernel_debug.h" +CCL_NAMESPACE_BEGIN + +ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index) +{ + ccl_global char *ray_state = 
kernel_split_state.ray_state; + +#ifdef __BRANCHED_PATH__ + if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT_SHARED)) { + int orig_ray = kernel_split_state.branched_state[ray_index].original_ray; + + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + PathRadiance *orig_ray_L = &kernel_split_state.path_radiance[orig_ray]; + + path_radiance_sum_indirect(L); + path_radiance_accum_sample(orig_ray_L, L, 1); + + atomic_fetch_and_dec_uint32((ccl_global uint*)&kernel_split_state.branched_state[orig_ray].shared_sample_count); + + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER); + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER); + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER); + } + else { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + } +#else + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); #endif +} -#include "kernel/kernel_queues.h" -#include "kernel/kernel_work_stealing.h" +CCL_NAMESPACE_END #endif /* __KERNEL_SPLIT_H__ */ diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h index 17e6587883a..eac22050a38 100644 --- a/intern/cycles/kernel/split/kernel_split_data.h +++ b/intern/cycles/kernel/split/kernel_split_data.h @@ -31,14 +31,6 @@ ccl_device_inline uint64_t split_data_buffer_size(KernelGlobals *kg, size_t num_ size = size SPLIT_DATA_ENTRIES; #undef SPLIT_DATA_ENTRY -#ifdef __SUBSURFACE__ - size += align_up(num_elements * sizeof(SubsurfaceIndirectRays), 16); /* ss_rays */ -#endif - -#ifdef __VOLUME__ - size += align_up(2 * num_elements * sizeof(PathState), 16); /* state_shadow */ -#endif - return size; } @@ 
-57,16 +49,6 @@ ccl_device_inline void split_data_init(KernelGlobals *kg, SPLIT_DATA_ENTRIES; #undef SPLIT_DATA_ENTRY -#ifdef __SUBSURFACE__ - split_data->ss_rays = (ccl_global SubsurfaceIndirectRays*)p; - p += align_up(num_elements * sizeof(SubsurfaceIndirectRays), 16); -#endif - -#ifdef __VOLUME__ - split_data->state_shadow = (ccl_global PathState*)p; - p += align_up(2 * num_elements * sizeof(PathState), 16); -#endif - split_data->ray_state = ray_state; } diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h index 748197b7183..4bb2f0d3d80 100644 --- a/intern/cycles/kernel/split/kernel_split_data_types.h +++ b/intern/cycles/kernel/split/kernel_split_data_types.h @@ -43,6 +43,9 @@ typedef struct SplitParams { ccl_global char *use_queues_flag; ccl_global float *buffer; + + /* Place for storing sd->flag. AMD GPU OpenCL compiler workaround */ + int dummy_sd_flag; } SplitParams; /* Global memory variables [porting]; These memory is used for @@ -59,7 +62,64 @@ typedef struct SplitParams { SPLIT_DATA_ENTRY(DebugData, debug_data, 1) #else # define SPLIT_DATA_DEBUG_ENTRIES -#endif +#endif /* DEBUG */ + +#ifdef __BRANCHED_PATH__ + +typedef ccl_global struct SplitBranchedState { + /* various state that must be kept and restored after an indirect loop */ + PathState path_state; + float3 throughput; + Ray ray; + + struct ShaderData sd; + Intersection isect; + + char ray_state; + + /* indirect loop state */ + int next_closure; + int next_sample; + int num_samples; + +#ifdef __SUBSURFACE__ + int ss_next_closure; + int ss_next_sample; + int next_hit; + int num_hits; + + uint lcg_state; + SubsurfaceIntersection ss_isect; + +# ifdef __VOLUME__ + VolumeStack volume_stack[VOLUME_STACK_SIZE]; +# endif /* __VOLUME__ */ +#endif /*__SUBSURFACE__ */ + + int shared_sample_count; /* number of branched samples shared with other threads */ + int original_ray; /* index of original ray when sharing branched samples */ + bool 
waiting_on_shared_samples; +} SplitBranchedState; + +#define SPLIT_DATA_BRANCHED_ENTRIES \ + SPLIT_DATA_ENTRY( SplitBranchedState, branched_state, 1) +#else +#define SPLIT_DATA_BRANCHED_ENTRIES +#endif /* __BRANCHED_PATH__ */ + +#ifdef __SUBSURFACE__ +# define SPLIT_DATA_SUBSURFACE_ENTRIES \ + SPLIT_DATA_ENTRY(ccl_global SubsurfaceIndirectRays, ss_rays, 1) +#else +# define SPLIT_DATA_SUBSURFACE_ENTRIES +#endif /* __SUBSURFACE__ */ + +#ifdef __VOLUME__ +# define SPLIT_DATA_VOLUME_ENTRIES \ + SPLIT_DATA_ENTRY(ccl_global PathState, state_shadow, 1) +#else +# define SPLIT_DATA_VOLUME_ENTRIES +#endif /* __VOLUME__ */ #define SPLIT_DATA_ENTRIES \ SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \ @@ -69,9 +129,6 @@ typedef struct SplitParams { SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \ SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \ SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \ - SPLIT_DATA_ENTRY(ccl_global float3, ao_alpha, 1) \ - SPLIT_DATA_ENTRY(ccl_global float3, ao_bsdf, 1) \ - SPLIT_DATA_ENTRY(ccl_global Ray, ao_light_ray, 1) \ SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \ SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \ SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \ @@ -79,6 +136,28 @@ typedef struct SplitParams { SPLIT_DATA_ENTRY(ccl_global uint, work_array, 1) \ SPLIT_DATA_ENTRY(ShaderData, sd, 1) \ SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \ + SPLIT_DATA_SUBSURFACE_ENTRIES \ + SPLIT_DATA_VOLUME_ENTRIES \ + SPLIT_DATA_BRANCHED_ENTRIES \ + SPLIT_DATA_DEBUG_ENTRIES \ + +/* entries to be copied to inactive rays when sharing branched samples (TODO: which are actually needed?) 
*/ +#define SPLIT_DATA_ENTRIES_BRANCHED_SHARED \ + SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \ + SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \ + SPLIT_DATA_ENTRY(ccl_global float, L_transparent, 1) \ + SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \ + SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \ + SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \ + SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \ + SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \ + SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \ + SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \ + SPLIT_DATA_ENTRY(ShaderData, sd, 1) \ + SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \ + SPLIT_DATA_SUBSURFACE_ENTRIES \ + SPLIT_DATA_VOLUME_ENTRIES \ + SPLIT_DATA_BRANCHED_ENTRIES \ SPLIT_DATA_DEBUG_ENTRIES \ /* struct that holds pointers to data in the shared state buffer */ @@ -87,14 +166,6 @@ typedef struct SplitData { SPLIT_DATA_ENTRIES #undef SPLIT_DATA_ENTRY -#ifdef __SUBSURFACE__ - ccl_global SubsurfaceIndirectRays *ss_rays; -#endif - -#ifdef __VOLUME__ - ccl_global PathState *state_shadow; -#endif - /* this is actually in a separate buffer from the rest of the split state data (so it can be read back from * the host easily) but is still used the same as the other data so we have it here in this struct as well */ @@ -122,6 +193,11 @@ typedef struct BackgroundAOLocals { uint queue_atomics_ao; } BackgroundAOLocals; +typedef struct ShaderSortLocals { + uint local_value[SHADER_SORT_BLOCK_SIZE]; + ushort local_index[SHADER_SORT_BLOCK_SIZE]; +} ShaderSortLocals; + CCL_NAMESPACE_END #endif /* __KERNEL_SPLIT_DATA_TYPES_H__ */ diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h index 0b4d50c70ee..d5083b23f80 100644 --- a/intern/cycles/kernel/split/kernel_subsurface_scatter.h +++ b/intern/cycles/kernel/split/kernel_subsurface_scatter.h @@ -16,82 +16,306 @@ CCL_NAMESPACE_BEGIN +#if defined(__BRANCHED_PATH__) && 
defined(__SUBSURFACE__) -ccl_device void kernel_subsurface_scatter(KernelGlobals *kg, - ccl_local_param unsigned int* local_queue_atomics) +ccl_device_inline void kernel_split_branched_path_subsurface_indirect_light_init(KernelGlobals *kg, int ray_index) { -#ifdef __SUBSURFACE__ - if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { - *local_queue_atomics = 0; + kernel_split_branched_path_indirect_loop_init(kg, ray_index); + + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + branched_state->ss_next_closure = 0; + branched_state->ss_next_sample = 0; + + branched_state->num_hits = 0; + branched_state->next_hit = 0; + + ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT); +} + +ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_iter(KernelGlobals *kg, int ray_index) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + ShaderData *sd = &branched_state->sd; + RNG rng = kernel_split_state.rng[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + + for(int i = branched_state->ss_next_closure; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + + if(!CLOSURE_IS_BSSRDF(sc->type)) + continue; + + /* set up random number generator */ + if(branched_state->ss_next_sample == 0 && branched_state->next_hit == 0 && + branched_state->next_closure == 0 && branched_state->next_sample == 0) + { + branched_state->lcg_state = lcg_state_init(&rng, + branched_state->path_state.rng_offset, + branched_state->path_state.sample, + 0x68bc21eb); + } + int num_samples = kernel_data.integrator.subsurface_samples; + float num_samples_inv = 1.0f/num_samples; + RNG bssrdf_rng = cmj_hash(rng, i); + + /* do subsurface scatter step with copy of shader data, this will + * replace the BSSRDF with a diffuse BSDF closure */ + for(int j = 
branched_state->ss_next_sample; j < num_samples; j++) { + ccl_global SubsurfaceIntersection *ss_isect = &branched_state->ss_isect; + float bssrdf_u, bssrdf_v; + path_branched_rng_2D(kg, + &bssrdf_rng, + &branched_state->path_state, + j, + num_samples, + PRNG_BSDF_U, + &bssrdf_u, + &bssrdf_v); + + /* intersection is expensive so avoid doing multiple times for the same input */ + if(branched_state->next_hit == 0 && branched_state->next_closure == 0 && branched_state->next_sample == 0) { + RNG lcg_state = branched_state->lcg_state; + SubsurfaceIntersection ss_isect_private; + + branched_state->num_hits = subsurface_scatter_multi_intersect(kg, + &ss_isect_private, + sd, + sc, + &lcg_state, + bssrdf_u, bssrdf_v, + true); + + branched_state->lcg_state = lcg_state; + *ss_isect = ss_isect_private; + } + +#ifdef __VOLUME__ + Ray volume_ray = branched_state->ray; + bool need_update_volume_stack = + kernel_data.integrator.use_volumes && + sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME; +#endif /* __VOLUME__ */ + + /* compute lighting with the BSDF closure */ + for(int hit = branched_state->next_hit; hit < branched_state->num_hits; hit++) { + ShaderData *bssrdf_sd = &kernel_split_state.sd[ray_index]; + *bssrdf_sd = *sd; /* note: copy happens each iteration of inner loop, this is + * important as the indirect path will write into bssrdf_sd */ + + SubsurfaceIntersection ss_isect_private = *ss_isect; + subsurface_scatter_multi_setup(kg, + &ss_isect_private, + hit, + bssrdf_sd, + &branched_state->path_state, + branched_state->path_state.flag, + sc, + true); + *ss_isect = ss_isect_private; + + ccl_global PathState *hit_state = &kernel_split_state.path_state[ray_index]; + *hit_state = branched_state->path_state; + + path_state_branch(hit_state, j, num_samples); + +#ifdef __VOLUME__ + if(need_update_volume_stack) { + /* Setup ray from previous surface point to the new one. 
*/ + float3 P = ray_offset(bssrdf_sd->P, -bssrdf_sd->Ng); + volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t); + + /* this next part is expensive as it does scene intersection so only do once */ + if(branched_state->next_closure == 0 && branched_state->next_sample == 0) { + for(int k = 0; k < VOLUME_STACK_SIZE; k++) { + branched_state->volume_stack[k] = hit_state->volume_stack[k]; + } + + kernel_volume_stack_update_for_subsurface(kg, + emission_sd, + &volume_ray, + branched_state->volume_stack); + } + + for(int k = 0; k < VOLUME_STACK_SIZE; k++) { + hit_state->volume_stack[k] = branched_state->volume_stack[k]; + } + } +#endif /* __VOLUME__ */ + +#ifdef __EMISSION__ + if(branched_state->next_closure == 0 && branched_state->next_sample == 0) { + /* direct light */ + if(kernel_data.integrator.use_direct_light) { + int all = (kernel_data.integrator.sample_all_lights_direct) || + (branched_state->path_state.flag & PATH_RAY_SHADOW_CATCHER); + kernel_branched_path_surface_connect_light(kg, + &rng, + bssrdf_sd, + emission_sd, + hit_state, + branched_state->throughput, + num_samples_inv, + L, + all); + } + } +#endif /* __EMISSION__ */ + + /* indirect light */ + if(kernel_split_branched_path_surface_indirect_light_iter(kg, + ray_index, + num_samples_inv, + bssrdf_sd, + false, + false)) + { + branched_state->ss_next_closure = i; + branched_state->ss_next_sample = j; + branched_state->next_hit = hit; + + return true; + } + + branched_state->next_closure = 0; + } + + branched_state->next_hit = 0; + } + + branched_state->ss_next_sample = 0; + } + + branched_state->ss_next_closure = sd->num_closure; + + branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0); + if(branched_state->waiting_on_shared_samples) { + return true; + } + + kernel_split_branched_path_indirect_loop_end(kg, ray_index); + + return false; +} + +#endif /* __BRANCHED_PATH__ && __SUBSURFACE__ */ + +ccl_device void kernel_subsurface_scatter(KernelGlobals *kg) +{ + int 
thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(thread_index == 0) { + /* We will empty both queues in this kernel. */ + kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; + kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; } - ccl_barrier(CCL_LOCAL_MEM_FENCE); int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); ray_index = get_ray_index(kg, ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, kernel_split_state.queue_data, kernel_split_params.queue_size, - 0); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - - char enqueue_flag = 0; - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif + 1); + get_ray_index(kg, thread_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); +#ifdef __SUBSURFACE__ ccl_global char *ray_state = kernel_split_state.ray_state; - ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; - RNG rng = kernel_split_state.rng[ray_index]; - ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; - ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; - ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; - ShaderData *sd = &kernel_split_state.sd[ray_index]; - ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = 
&kernel_split_state.path_radiance[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; + if(sd->flag & SD_BSSRDF) { - if(kernel_path_subsurface_scatter(kg, - sd, - emission_sd, - L, - state, - &rng, - ray, - throughput, - ss_indirect)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - enqueue_flag = 1; + +#ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched) { +#endif + if(kernel_path_subsurface_scatter(kg, + sd, + emission_sd, + L, + state, + &rng, + ray, + throughput, + ss_indirect)) + { + kernel_split_path_end(kg, ray_index); + } +#ifdef __BRANCHED_PATH__ + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { + float bssrdf_probability; + ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability); + + /* modify throughput for picking bssrdf or bsdf */ + *throughput *= bssrdf_probability; + + /* do bssrdf scatter step if we picked a bssrdf closure */ + if(sc) { + uint lcg_state = lcg_state_init(&rng, state->rng_offset, state->sample, 0x68bc21eb); + float bssrdf_u, bssrdf_v; + path_state_rng_2D(kg, + &rng, + state, + PRNG_BSDF_U, + &bssrdf_u, &bssrdf_v); + subsurface_scatter_step(kg, + sd, + state, + state->flag, + sc, + &lcg_state, + bssrdf_u, bssrdf_v, + false); + } + } + else { + kernel_split_branched_path_subsurface_indirect_light_init(kg, ray_index); + + if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } } +#endif } kernel_split_state.rng[ray_index] = rng; } -#ifndef __COMPUTE_DEVICE_GPU__ +# ifdef __BRANCHED_PATH__ + if(ccl_global_id(0) == 0 && 
ccl_global_id(1) == 0) { + kernel_split_params.queue_index[QUEUE_SUBSURFACE_INDIRECT_ITER] = 0; } -#endif - /* Enqueue RAY_UPDATE_BUFFER rays. */ - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - kernel_split_params.queue_size, - local_queue_atomics, - kernel_split_state.queue_data, - kernel_split_params.queue_index); + /* iter loop */ + ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0), + QUEUE_SUBSURFACE_INDIRECT_ITER, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + + if(IS_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER)) { + /* for render passes, sum and reset indirect light pass variables + * for the next samples */ + path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]); + path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]); + + if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + } +# endif /* __BRANCHED_PATH__ */ #endif /* __SUBSURFACE__ */ diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h index 1885e1af851..4268813b263 100644 --- a/intern/cycles/kernel/svm/svm_closure.h +++ b/intern/cycles/kernel/svm/svm_closure.h @@ -76,6 +76,345 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * float param2 = (stack_valid(param2_offset))? 
stack_load_float(stack, param2_offset): __uint_as_float(node.w); switch(type) { +#ifdef __PRINCIPLED__ + case CLOSURE_BSDF_PRINCIPLED_ID: { + uint specular_offset, roughness_offset, specular_tint_offset, anisotropic_offset, sheen_offset, + sheen_tint_offset, clearcoat_offset, clearcoat_roughness_offset, eta_offset, transmission_offset, + anisotropic_rotation_offset, transmission_roughness_offset; + uint4 data_node2 = read_node(kg, offset); + + float3 T = stack_load_float3(stack, data_node.y); + decode_node_uchar4(data_node.z, &specular_offset, &roughness_offset, &specular_tint_offset, &anisotropic_offset); + decode_node_uchar4(data_node.w, &sheen_offset, &sheen_tint_offset, &clearcoat_offset, &clearcoat_roughness_offset); + decode_node_uchar4(data_node2.x, &eta_offset, &transmission_offset, &anisotropic_rotation_offset, &transmission_roughness_offset); + + // get Disney principled parameters + float metallic = param1; + float subsurface = param2; + float specular = stack_load_float(stack, specular_offset); + float roughness = stack_load_float(stack, roughness_offset); + float specular_tint = stack_load_float(stack, specular_tint_offset); + float anisotropic = stack_load_float(stack, anisotropic_offset); + float sheen = stack_load_float(stack, sheen_offset); + float sheen_tint = stack_load_float(stack, sheen_tint_offset); + float clearcoat = stack_load_float(stack, clearcoat_offset); + float clearcoat_roughness = stack_load_float(stack, clearcoat_roughness_offset); + float transmission = stack_load_float(stack, transmission_offset); + float anisotropic_rotation = stack_load_float(stack, anisotropic_rotation_offset); + float transmission_roughness = stack_load_float(stack, transmission_roughness_offset); + float eta = fmaxf(stack_load_float(stack, eta_offset), 1e-5f); + + ClosureType distribution = stack_valid(data_node2.y) ? 
(ClosureType) data_node2.y : CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID; + + /* rotate tangent */ + if(anisotropic_rotation != 0.0f) + T = rotate_around_axis(T, N, anisotropic_rotation * M_2PI_F); + + /* calculate ior */ + float ior = (sd->flag & SD_BACKFACING) ? 1.0f / eta : eta; + + // calculate fresnel for refraction + float cosNO = dot(N, sd->I); + float fresnel = fresnel_dielectric_cos(cosNO, ior); + + // calculate weights of the diffuse and specular part + float diffuse_weight = (1.0f - saturate(metallic)) * (1.0f - saturate(transmission)); + + float final_transmission = saturate(transmission) * (1.0f - saturate(metallic)); + float specular_weight = (1.0f - final_transmission); + + // get the base color + uint4 data_base_color = read_node(kg, offset); + float3 base_color = stack_valid(data_base_color.x) ? stack_load_float3(stack, data_base_color.x) : + make_float3(__uint_as_float(data_base_color.y), __uint_as_float(data_base_color.z), __uint_as_float(data_base_color.w)); + + // get the additional clearcoat normal and subsurface scattering radius + uint4 data_cn_ssr = read_node(kg, offset); + float3 clearcoat_normal = stack_valid(data_cn_ssr.x) ? stack_load_float3(stack, data_cn_ssr.x) : sd->N; + float3 subsurface_radius = stack_valid(data_cn_ssr.y) ? stack_load_float3(stack, data_cn_ssr.y) : make_float3(1.0f, 1.0f, 1.0f); + + // get the subsurface color + uint4 data_subsurface_color = read_node(kg, offset); + float3 subsurface_color = stack_valid(data_subsurface_color.x) ? 
stack_load_float3(stack, data_subsurface_color.x) : + make_float3(__uint_as_float(data_subsurface_color.y), __uint_as_float(data_subsurface_color.z), __uint_as_float(data_subsurface_color.w)); + + float3 weight = sd->svm_closure_weight * mix_weight; + +#ifdef __SUBSURFACE__ + float3 mixed_ss_base_color = subsurface_color * subsurface + base_color * (1.0f - subsurface); + float3 subsurf_weight = weight * mixed_ss_base_color * diffuse_weight; + float subsurf_sample_weight = fabsf(average(subsurf_weight)); + + /* disable in case of diffuse ancestor, can't see it well then and + * adds considerably noise due to probabilities of continuing path + * getting lower and lower */ + if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR) { + subsurface = 0.0f; + + /* need to set the base color in this case such that the + * rays get the correctly mixed color after transmitting + * the object */ + base_color = mixed_ss_base_color; + } + + /* diffuse */ + if(fabsf(average(mixed_ss_base_color)) > CLOSURE_WEIGHT_CUTOFF) { + if(subsurface <= CLOSURE_WEIGHT_CUTOFF && diffuse_weight > CLOSURE_WEIGHT_CUTOFF) { + float3 diff_weight = weight * base_color * diffuse_weight; + + PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), diff_weight); + + if(bsdf) { + bsdf->N = N; + bsdf->roughness = roughness; + + /* setup bsdf */ + sd->flag |= bsdf_principled_diffuse_setup(bsdf); + } + } + else if(subsurface > CLOSURE_WEIGHT_CUTOFF && subsurf_sample_weight > CLOSURE_WEIGHT_CUTOFF) { + /* radius * scale */ + float3 radius = subsurface_radius * subsurface; + /* sharpness */ + float sharpness = 0.0f; + /* texture color blur */ + float texture_blur = 0.0f; + + /* create one closure per color channel */ + Bssrdf *bssrdf = bssrdf_alloc(sd, make_float3(subsurf_weight.x, 0.0f, 0.0f)); + if(bssrdf) { + bssrdf->sample_weight = subsurf_sample_weight; + bssrdf->radius = radius.x; + bssrdf->texture_blur = texture_blur; + bssrdf->albedo = subsurface_color.x; + 
bssrdf->sharpness = sharpness; + bssrdf->N = N; + bssrdf->roughness = roughness; + + /* setup bsdf */ + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID); + } + + bssrdf = bssrdf_alloc(sd, make_float3(0.0f, subsurf_weight.y, 0.0f)); + if(bssrdf) { + bssrdf->sample_weight = subsurf_sample_weight; + bssrdf->radius = radius.y; + bssrdf->texture_blur = texture_blur; + bssrdf->albedo = subsurface_color.y; + bssrdf->sharpness = sharpness; + bssrdf->N = N; + bssrdf->roughness = roughness; + + /* setup bsdf */ + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID); + } + + bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, subsurf_weight.z)); + if(bssrdf) { + bssrdf->sample_weight = subsurf_sample_weight; + bssrdf->radius = radius.z; + bssrdf->texture_blur = texture_blur; + bssrdf->albedo = subsurface_color.z; + bssrdf->sharpness = sharpness; + bssrdf->N = N; + bssrdf->roughness = roughness; + + /* setup bsdf */ + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID); + } + } + } +#else + /* diffuse */ + if(diffuse_weight > CLOSURE_WEIGHT_CUTOFF) { + float3 diff_weight = weight * base_color * diffuse_weight; + + PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), diff_weight); + + if(bsdf) { + bsdf->N = N; + bsdf->roughness = roughness; + + /* setup bsdf */ + sd->flag |= bsdf_principled_diffuse_setup(bsdf); + } + } +#endif + + /* sheen */ + if(diffuse_weight > CLOSURE_WEIGHT_CUTOFF && sheen > CLOSURE_WEIGHT_CUTOFF) { + float m_cdlum = linear_rgb_to_gray(base_color); + float3 m_ctint = m_cdlum > 0.0f ? base_color / m_cdlum : make_float3(1.0f, 1.0f, 1.0f); // normalize lum. 
to isolate hue+sat + + /* color of the sheen component */ + float3 sheen_color = make_float3(1.0f, 1.0f, 1.0f) * (1.0f - sheen_tint) + m_ctint * sheen_tint; + + float3 sheen_weight = weight * sheen * sheen_color * diffuse_weight; + + PrincipledSheenBsdf *bsdf = (PrincipledSheenBsdf*)bsdf_alloc(sd, sizeof(PrincipledSheenBsdf), sheen_weight); + + if(bsdf) { + bsdf->N = N; + + /* setup bsdf */ + sd->flag |= bsdf_principled_sheen_setup(bsdf); + } + } + + /* specular reflection */ +#ifdef __CAUSTICS_TRICKS__ + if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) { +#endif + if(specular_weight > CLOSURE_WEIGHT_CUTOFF && (specular > CLOSURE_WEIGHT_CUTOFF || metallic > CLOSURE_WEIGHT_CUTOFF)) { + float3 spec_weight = weight * specular_weight; + + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), spec_weight); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + + if(bsdf && extra) { + bsdf->N = N; + bsdf->ior = (2.0f / (1.0f - safe_sqrtf(0.08f * specular))) - 1.0f; + bsdf->T = T; + bsdf->extra = extra; + + float aspect = safe_sqrtf(1.0f - anisotropic * 0.9f); + float r2 = roughness * roughness; + + bsdf->alpha_x = r2 / aspect; + bsdf->alpha_y = r2 * aspect; + + float m_cdlum = 0.3f * base_color.x + 0.6f * base_color.y + 0.1f * base_color.z; // luminance approx. + float3 m_ctint = m_cdlum > 0.0f ? base_color / m_cdlum : make_float3(0.0f, 0.0f, 0.0f); // normalize lum. 
to isolate hue+sat + float3 tmp_col = make_float3(1.0f, 1.0f, 1.0f) * (1.0f - specular_tint) + m_ctint * specular_tint; + + bsdf->extra->cspec0 = (specular * 0.08f * tmp_col) * (1.0f - metallic) + base_color * metallic; + bsdf->extra->color = base_color; + + /* setup bsdf */ + if(distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID || roughness <= 0.075f) /* use single-scatter GGX */ + sd->flag |= bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf, sd); + else /* use multi-scatter GGX */ + sd->flag |= bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf, sd); + } + } +#ifdef __CAUSTICS_TRICKS__ + } +#endif + + /* BSDF */ +#ifdef __CAUSTICS_TRICKS__ + if(kernel_data.integrator.caustics_reflective || kernel_data.integrator.caustics_refractive || (path_flag & PATH_RAY_DIFFUSE) == 0) { +#endif + if(final_transmission > CLOSURE_WEIGHT_CUTOFF) { + float3 glass_weight = weight * final_transmission; + float3 cspec0 = base_color * specular_tint + make_float3(1.0f, 1.0f, 1.0f) * (1.0f - specular_tint); + + if(roughness <= 5e-2f || distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID) { /* use single-scatter GGX */ + float refl_roughness = roughness; + + /* reflection */ +#ifdef __CAUSTICS_TRICKS__ + if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) +#endif + { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), glass_weight*fresnel); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + + if(bsdf && extra) { + bsdf->N = N; + bsdf->extra = extra; + + bsdf->alpha_x = refl_roughness * refl_roughness; + bsdf->alpha_y = refl_roughness * refl_roughness; + bsdf->ior = ior; + + bsdf->extra->color = base_color; + bsdf->extra->cspec0 = cspec0; + + /* setup bsdf */ + sd->flag |= bsdf_microfacet_ggx_fresnel_setup(bsdf, sd); + } + } + + /* refraction */ +#ifdef __CAUSTICS_TRICKS__ + if(kernel_data.integrator.caustics_refractive || (path_flag & PATH_RAY_DIFFUSE) == 0) +#endif + { + 
MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), base_color*glass_weight*(1.0f - fresnel)); + + if(bsdf) { + bsdf->N = N; + + if(distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID) + transmission_roughness = 1.0f - (1.0f - refl_roughness) * (1.0f - transmission_roughness); + else + transmission_roughness = refl_roughness; + + bsdf->alpha_x = transmission_roughness * transmission_roughness; + bsdf->alpha_y = transmission_roughness * transmission_roughness; + bsdf->ior = ior; + + /* setup bsdf */ + sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf); + } + } + } + else { /* use multi-scatter GGX */ + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), glass_weight); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + + if(bsdf && extra) { + bsdf->N = N; + bsdf->extra = extra; + bsdf->T = make_float3(0.0f, 0.0f, 0.0f); + + bsdf->alpha_x = roughness * roughness; + bsdf->alpha_y = roughness * roughness; + bsdf->ior = ior; + + bsdf->extra->color = base_color; + bsdf->extra->cspec0 = cspec0; + + /* setup bsdf */ + sd->flag |= bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf, sd); + } + } + } +#ifdef __CAUSTICS_TRICKS__ + } +#endif + + /* clearcoat */ +#ifdef __CAUSTICS_TRICKS__ + if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) { +#endif + if(clearcoat > CLOSURE_WEIGHT_CUTOFF) { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + + if(bsdf && extra) { + bsdf->N = clearcoat_normal; + bsdf->ior = 1.5f; + bsdf->extra = extra; + + bsdf->alpha_x = clearcoat_roughness * clearcoat_roughness; + bsdf->alpha_y = clearcoat_roughness * clearcoat_roughness; + + bsdf->extra->cspec0 = make_float3(0.04f, 0.04f, 0.04f); + bsdf->extra->clearcoat = clearcoat; + + /* setup bsdf */ + sd->flag |= 
bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd); + } + } +#ifdef __CAUSTICS_TRICKS__ + } +#endif + + break; + } +#endif /* __PRINCIPLED__ */ case CLOSURE_BSDF_DIFFUSE_ID: { float3 weight = sd->svm_closure_weight * mix_weight; OrenNayarBsdf *bsdf = (OrenNayarBsdf*)bsdf_alloc(sd, sizeof(OrenNayarBsdf), weight); @@ -110,6 +449,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight); if(bsdf) { + bsdf->N = N; sd->flag |= bsdf_transparent_setup(bsdf); } break; @@ -344,6 +684,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * #ifdef __CAUSTICS_TRICKS__ if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE)) break; + ATTR_FALLTHROUGH; #endif case CLOSURE_BSDF_DIFFUSE_TOON_ID: { float3 weight = sd->svm_closure_weight * mix_weight; @@ -370,6 +711,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight); if(bsdf) { + bsdf->N = N; /* todo: giving a fixed weight here will cause issues when * mixing multiple BSDFS. energy will not be conserved and * the throughput can blow up after multiple bounces. 
we @@ -383,6 +725,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * HairBsdf *bsdf = (HairBsdf*)bsdf_alloc(sd, sizeof(HairBsdf), weight); if(bsdf) { + bsdf->N = N; bsdf->roughness1 = param1; bsdf->roughness2 = param2; bsdf->offset = -stack_load_float(stack, data_node.z); diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h index c94fa130af7..656357be52d 100644 --- a/intern/cycles/kernel/svm/svm_displace.h +++ b/intern/cycles/kernel/svm/svm_displace.h @@ -63,8 +63,13 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac strength = max(strength, 0.0f); /* compute and output perturbed normal */ - float3 normal_out = normalize(absdet*normal_in - distance*signf(det)*surfgrad); - normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in); + float3 normal_out = safe_normalize(absdet*normal_in - distance*signf(det)*surfgrad); + if(is_zero(normal_out)) { + normal_out = normal_in; + } + else { + normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in); + } if(use_object_space) { object_normal_transform(kg, sd, &normal_out); diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h index 4a09d9f6653..cce4e89e715 100644 --- a/intern/cycles/kernel/svm/svm_geometry.h +++ b/intern/cycles/kernel/svm/svm_geometry.h @@ -37,6 +37,7 @@ ccl_device_inline void svm_node_geometry(KernelGlobals *kg, #ifdef __UV__ case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break; #endif + default: data = make_float3(0.0f, 0.0f, 0.0f); } stack_store_float3(stack, out_offset, data); diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h index 76acc9253a1..7be03dcd65a 100644 --- a/intern/cycles/kernel/svm/svm_image.h +++ b/intern/cycles/kernel/svm/svm_image.h @@ -16,29 +16,10 @@ CCL_NAMESPACE_BEGIN -/* Float4 textures on various devices. 
*/ -#if defined(__KERNEL_CPU__) -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CPU -#elif defined(__KERNEL_CUDA__) -# if __CUDA_ARCH__ < 300 -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CUDA -# else -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CUDA_KEPLER -# endif -#else -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_OPENCL -#endif - ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb, uint use_alpha) { #ifdef __KERNEL_CPU__ -# ifdef __KERNEL_SSE2__ - ssef r_ssef; - float4 &r = (float4 &)r_ssef; - r = kernel_tex_image_interp(id, x, y); -# else float4 r = kernel_tex_image_interp(id, x, y); -# endif #elif defined(__KERNEL_OPENCL__) float4 r = kernel_tex_image_interp(kg, id, x, y); #else @@ -56,94 +37,94 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, switch(id) { case 0: r = kernel_tex_image_interp(__tex_image_float4_000, x, y); break; - case 1: r = kernel_tex_image_interp(__tex_image_float4_001, x, y); break; - case 2: r = kernel_tex_image_interp(__tex_image_float4_002, x, y); break; - case 3: r = kernel_tex_image_interp(__tex_image_float4_003, x, y); break; - case 4: r = kernel_tex_image_interp(__tex_image_float4_004, x, y); break; - case 5: r = kernel_tex_image_interp(__tex_image_byte4_005, x, y); break; - case 6: r = kernel_tex_image_interp(__tex_image_byte4_006, x, y); break; - case 7: r = kernel_tex_image_interp(__tex_image_byte4_007, x, y); break; - case 8: r = kernel_tex_image_interp(__tex_image_byte4_008, x, y); break; + case 8: r = kernel_tex_image_interp(__tex_image_float4_008, x, y); break; + case 16: r = kernel_tex_image_interp(__tex_image_float4_016, x, y); break; + case 24: r = kernel_tex_image_interp(__tex_image_float4_024, x, y); break; + case 32: r = kernel_tex_image_interp(__tex_image_float4_032, x, y); break; + case 1: r = kernel_tex_image_interp(__tex_image_byte4_001, x, y); break; case 9: r = kernel_tex_image_interp(__tex_image_byte4_009, x, y); break; - case 10: r = 
kernel_tex_image_interp(__tex_image_byte4_010, x, y); break; - case 11: r = kernel_tex_image_interp(__tex_image_byte4_011, x, y); break; - case 12: r = kernel_tex_image_interp(__tex_image_byte4_012, x, y); break; - case 13: r = kernel_tex_image_interp(__tex_image_byte4_013, x, y); break; - case 14: r = kernel_tex_image_interp(__tex_image_byte4_014, x, y); break; - case 15: r = kernel_tex_image_interp(__tex_image_byte4_015, x, y); break; - case 16: r = kernel_tex_image_interp(__tex_image_byte4_016, x, y); break; case 17: r = kernel_tex_image_interp(__tex_image_byte4_017, x, y); break; - case 18: r = kernel_tex_image_interp(__tex_image_byte4_018, x, y); break; - case 19: r = kernel_tex_image_interp(__tex_image_byte4_019, x, y); break; - case 20: r = kernel_tex_image_interp(__tex_image_byte4_020, x, y); break; - case 21: r = kernel_tex_image_interp(__tex_image_byte4_021, x, y); break; - case 22: r = kernel_tex_image_interp(__tex_image_byte4_022, x, y); break; - case 23: r = kernel_tex_image_interp(__tex_image_byte4_023, x, y); break; - case 24: r = kernel_tex_image_interp(__tex_image_byte4_024, x, y); break; case 25: r = kernel_tex_image_interp(__tex_image_byte4_025, x, y); break; - case 26: r = kernel_tex_image_interp(__tex_image_byte4_026, x, y); break; - case 27: r = kernel_tex_image_interp(__tex_image_byte4_027, x, y); break; - case 28: r = kernel_tex_image_interp(__tex_image_byte4_028, x, y); break; - case 29: r = kernel_tex_image_interp(__tex_image_byte4_029, x, y); break; - case 30: r = kernel_tex_image_interp(__tex_image_byte4_030, x, y); break; - case 31: r = kernel_tex_image_interp(__tex_image_byte4_031, x, y); break; - case 32: r = kernel_tex_image_interp(__tex_image_byte4_032, x, y); break; case 33: r = kernel_tex_image_interp(__tex_image_byte4_033, x, y); break; - case 34: r = kernel_tex_image_interp(__tex_image_byte4_034, x, y); break; - case 35: r = kernel_tex_image_interp(__tex_image_byte4_035, x, y); break; - case 36: r = 
kernel_tex_image_interp(__tex_image_byte4_036, x, y); break; - case 37: r = kernel_tex_image_interp(__tex_image_byte4_037, x, y); break; - case 38: r = kernel_tex_image_interp(__tex_image_byte4_038, x, y); break; - case 39: r = kernel_tex_image_interp(__tex_image_byte4_039, x, y); break; - case 40: r = kernel_tex_image_interp(__tex_image_byte4_040, x, y); break; case 41: r = kernel_tex_image_interp(__tex_image_byte4_041, x, y); break; - case 42: r = kernel_tex_image_interp(__tex_image_byte4_042, x, y); break; - case 43: r = kernel_tex_image_interp(__tex_image_byte4_043, x, y); break; - case 44: r = kernel_tex_image_interp(__tex_image_byte4_044, x, y); break; - case 45: r = kernel_tex_image_interp(__tex_image_byte4_045, x, y); break; - case 46: r = kernel_tex_image_interp(__tex_image_byte4_046, x, y); break; - case 47: r = kernel_tex_image_interp(__tex_image_byte4_047, x, y); break; - case 48: r = kernel_tex_image_interp(__tex_image_byte4_048, x, y); break; case 49: r = kernel_tex_image_interp(__tex_image_byte4_049, x, y); break; - case 50: r = kernel_tex_image_interp(__tex_image_byte4_050, x, y); break; - case 51: r = kernel_tex_image_interp(__tex_image_byte4_051, x, y); break; - case 52: r = kernel_tex_image_interp(__tex_image_byte4_052, x, y); break; - case 53: r = kernel_tex_image_interp(__tex_image_byte4_053, x, y); break; - case 54: r = kernel_tex_image_interp(__tex_image_byte4_054, x, y); break; - case 55: r = kernel_tex_image_interp(__tex_image_byte4_055, x, y); break; - case 56: r = kernel_tex_image_interp(__tex_image_byte4_056, x, y); break; case 57: r = kernel_tex_image_interp(__tex_image_byte4_057, x, y); break; - case 58: r = kernel_tex_image_interp(__tex_image_byte4_058, x, y); break; - case 59: r = kernel_tex_image_interp(__tex_image_byte4_059, x, y); break; - case 60: r = kernel_tex_image_interp(__tex_image_byte4_060, x, y); break; - case 61: r = kernel_tex_image_interp(__tex_image_byte4_061, x, y); break; - case 62: r = 
kernel_tex_image_interp(__tex_image_byte4_062, x, y); break; - case 63: r = kernel_tex_image_interp(__tex_image_byte4_063, x, y); break; - case 64: r = kernel_tex_image_interp(__tex_image_byte4_064, x, y); break; case 65: r = kernel_tex_image_interp(__tex_image_byte4_065, x, y); break; - case 66: r = kernel_tex_image_interp(__tex_image_byte4_066, x, y); break; - case 67: r = kernel_tex_image_interp(__tex_image_byte4_067, x, y); break; - case 68: r = kernel_tex_image_interp(__tex_image_byte4_068, x, y); break; - case 69: r = kernel_tex_image_interp(__tex_image_byte4_069, x, y); break; - case 70: r = kernel_tex_image_interp(__tex_image_byte4_070, x, y); break; - case 71: r = kernel_tex_image_interp(__tex_image_byte4_071, x, y); break; - case 72: r = kernel_tex_image_interp(__tex_image_byte4_072, x, y); break; case 73: r = kernel_tex_image_interp(__tex_image_byte4_073, x, y); break; - case 74: r = kernel_tex_image_interp(__tex_image_byte4_074, x, y); break; - case 75: r = kernel_tex_image_interp(__tex_image_byte4_075, x, y); break; - case 76: r = kernel_tex_image_interp(__tex_image_byte4_076, x, y); break; - case 77: r = kernel_tex_image_interp(__tex_image_byte4_077, x, y); break; - case 78: r = kernel_tex_image_interp(__tex_image_byte4_078, x, y); break; - case 79: r = kernel_tex_image_interp(__tex_image_byte4_079, x, y); break; - case 80: r = kernel_tex_image_interp(__tex_image_byte4_080, x, y); break; case 81: r = kernel_tex_image_interp(__tex_image_byte4_081, x, y); break; - case 82: r = kernel_tex_image_interp(__tex_image_byte4_082, x, y); break; - case 83: r = kernel_tex_image_interp(__tex_image_byte4_083, x, y); break; - case 84: r = kernel_tex_image_interp(__tex_image_byte4_084, x, y); break; - case 85: r = kernel_tex_image_interp(__tex_image_byte4_085, x, y); break; - case 86: r = kernel_tex_image_interp(__tex_image_byte4_086, x, y); break; - case 87: r = kernel_tex_image_interp(__tex_image_byte4_087, x, y); break; - case 88: r = 
kernel_tex_image_interp(__tex_image_byte4_088, x, y); break; + case 89: r = kernel_tex_image_interp(__tex_image_byte4_089, x, y); break; + case 97: r = kernel_tex_image_interp(__tex_image_byte4_097, x, y); break; + case 105: r = kernel_tex_image_interp(__tex_image_byte4_105, x, y); break; + case 113: r = kernel_tex_image_interp(__tex_image_byte4_113, x, y); break; + case 121: r = kernel_tex_image_interp(__tex_image_byte4_121, x, y); break; + case 129: r = kernel_tex_image_interp(__tex_image_byte4_129, x, y); break; + case 137: r = kernel_tex_image_interp(__tex_image_byte4_137, x, y); break; + case 145: r = kernel_tex_image_interp(__tex_image_byte4_145, x, y); break; + case 153: r = kernel_tex_image_interp(__tex_image_byte4_153, x, y); break; + case 161: r = kernel_tex_image_interp(__tex_image_byte4_161, x, y); break; + case 169: r = kernel_tex_image_interp(__tex_image_byte4_169, x, y); break; + case 177: r = kernel_tex_image_interp(__tex_image_byte4_177, x, y); break; + case 185: r = kernel_tex_image_interp(__tex_image_byte4_185, x, y); break; + case 193: r = kernel_tex_image_interp(__tex_image_byte4_193, x, y); break; + case 201: r = kernel_tex_image_interp(__tex_image_byte4_201, x, y); break; + case 209: r = kernel_tex_image_interp(__tex_image_byte4_209, x, y); break; + case 217: r = kernel_tex_image_interp(__tex_image_byte4_217, x, y); break; + case 225: r = kernel_tex_image_interp(__tex_image_byte4_225, x, y); break; + case 233: r = kernel_tex_image_interp(__tex_image_byte4_233, x, y); break; + case 241: r = kernel_tex_image_interp(__tex_image_byte4_241, x, y); break; + case 249: r = kernel_tex_image_interp(__tex_image_byte4_249, x, y); break; + case 257: r = kernel_tex_image_interp(__tex_image_byte4_257, x, y); break; + case 265: r = kernel_tex_image_interp(__tex_image_byte4_265, x, y); break; + case 273: r = kernel_tex_image_interp(__tex_image_byte4_273, x, y); break; + case 281: r = kernel_tex_image_interp(__tex_image_byte4_281, x, y); break; + case 289: r = 
kernel_tex_image_interp(__tex_image_byte4_289, x, y); break; + case 297: r = kernel_tex_image_interp(__tex_image_byte4_297, x, y); break; + case 305: r = kernel_tex_image_interp(__tex_image_byte4_305, x, y); break; + case 313: r = kernel_tex_image_interp(__tex_image_byte4_313, x, y); break; + case 321: r = kernel_tex_image_interp(__tex_image_byte4_321, x, y); break; + case 329: r = kernel_tex_image_interp(__tex_image_byte4_329, x, y); break; + case 337: r = kernel_tex_image_interp(__tex_image_byte4_337, x, y); break; + case 345: r = kernel_tex_image_interp(__tex_image_byte4_345, x, y); break; + case 353: r = kernel_tex_image_interp(__tex_image_byte4_353, x, y); break; + case 361: r = kernel_tex_image_interp(__tex_image_byte4_361, x, y); break; + case 369: r = kernel_tex_image_interp(__tex_image_byte4_369, x, y); break; + case 377: r = kernel_tex_image_interp(__tex_image_byte4_377, x, y); break; + case 385: r = kernel_tex_image_interp(__tex_image_byte4_385, x, y); break; + case 393: r = kernel_tex_image_interp(__tex_image_byte4_393, x, y); break; + case 401: r = kernel_tex_image_interp(__tex_image_byte4_401, x, y); break; + case 409: r = kernel_tex_image_interp(__tex_image_byte4_409, x, y); break; + case 417: r = kernel_tex_image_interp(__tex_image_byte4_417, x, y); break; + case 425: r = kernel_tex_image_interp(__tex_image_byte4_425, x, y); break; + case 433: r = kernel_tex_image_interp(__tex_image_byte4_433, x, y); break; + case 441: r = kernel_tex_image_interp(__tex_image_byte4_441, x, y); break; + case 449: r = kernel_tex_image_interp(__tex_image_byte4_449, x, y); break; + case 457: r = kernel_tex_image_interp(__tex_image_byte4_457, x, y); break; + case 465: r = kernel_tex_image_interp(__tex_image_byte4_465, x, y); break; + case 473: r = kernel_tex_image_interp(__tex_image_byte4_473, x, y); break; + case 481: r = kernel_tex_image_interp(__tex_image_byte4_481, x, y); break; + case 489: r = kernel_tex_image_interp(__tex_image_byte4_489, x, y); break; + case 497: r 
= kernel_tex_image_interp(__tex_image_byte4_497, x, y); break; + case 505: r = kernel_tex_image_interp(__tex_image_byte4_505, x, y); break; + case 513: r = kernel_tex_image_interp(__tex_image_byte4_513, x, y); break; + case 521: r = kernel_tex_image_interp(__tex_image_byte4_521, x, y); break; + case 529: r = kernel_tex_image_interp(__tex_image_byte4_529, x, y); break; + case 537: r = kernel_tex_image_interp(__tex_image_byte4_537, x, y); break; + case 545: r = kernel_tex_image_interp(__tex_image_byte4_545, x, y); break; + case 553: r = kernel_tex_image_interp(__tex_image_byte4_553, x, y); break; + case 561: r = kernel_tex_image_interp(__tex_image_byte4_561, x, y); break; + case 569: r = kernel_tex_image_interp(__tex_image_byte4_569, x, y); break; + case 577: r = kernel_tex_image_interp(__tex_image_byte4_577, x, y); break; + case 585: r = kernel_tex_image_interp(__tex_image_byte4_585, x, y); break; + case 593: r = kernel_tex_image_interp(__tex_image_byte4_593, x, y); break; + case 601: r = kernel_tex_image_interp(__tex_image_byte4_601, x, y); break; + case 609: r = kernel_tex_image_interp(__tex_image_byte4_609, x, y); break; + case 617: r = kernel_tex_image_interp(__tex_image_byte4_617, x, y); break; + case 625: r = kernel_tex_image_interp(__tex_image_byte4_625, x, y); break; + case 633: r = kernel_tex_image_interp(__tex_image_byte4_633, x, y); break; + case 641: r = kernel_tex_image_interp(__tex_image_byte4_641, x, y); break; + case 649: r = kernel_tex_image_interp(__tex_image_byte4_649, x, y); break; + case 657: r = kernel_tex_image_interp(__tex_image_byte4_657, x, y); break; + case 665: r = kernel_tex_image_interp(__tex_image_byte4_665, x, y); break; default: kernel_assert(0); return make_float4(0.0f, 0.0f, 0.0f, 0.0f); @@ -151,8 +132,13 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, # else CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id); /* float4, byte4 and half4 */ - if(id < TEX_START_FLOAT_CUDA_KEPLER) + const 
int texture_type = kernel_tex_type(id); + if(texture_type == IMAGE_DATA_TYPE_FLOAT4 || + texture_type == IMAGE_DATA_TYPE_BYTE4 || + texture_type == IMAGE_DATA_TYPE_HALF4) + { r = kernel_tex_image_interp_float4(tex, x, y); + } /* float, byte and half */ else { float f = kernel_tex_image_interp_float(tex, x, y); @@ -161,40 +147,22 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, # endif #endif -#ifdef __KERNEL_SSE2__ - float alpha = r.w; + const float alpha = r.w; if(use_alpha && alpha != 1.0f && alpha != 0.0f) { - r_ssef = r_ssef / ssef(alpha); - if(id >= TEX_NUM_FLOAT4_IMAGES) - r_ssef = min(r_ssef, ssef(1.0f)); - r.w = alpha; - } - - if(srgb) { - r_ssef = color_srgb_to_scene_linear(r_ssef); - r.w = alpha; - } -#else - if(use_alpha && r.w != 1.0f && r.w != 0.0f) { - float invw = 1.0f/r.w; - r.x *= invw; - r.y *= invw; - r.z *= invw; - - if(id >= TEX_NUM_FLOAT4_IMAGES) { - r.x = min(r.x, 1.0f); - r.y = min(r.y, 1.0f); - r.z = min(r.z, 1.0f); + r /= alpha; + const int texture_type = kernel_tex_type(id); + if(texture_type == IMAGE_DATA_TYPE_BYTE4 || + texture_type == IMAGE_DATA_TYPE_BYTE) + { + r = min(r, make_float4(1.0f, 1.0f, 1.0f, 1.0f)); } + r.w = alpha; } if(srgb) { - r.x = color_srgb_to_scene_linear(r.x); - r.y = color_srgb_to_scene_linear(r.y); - r.z = color_srgb_to_scene_linear(r.z); + r = color_srgb_to_scene_linear_v4(r); } -#endif return r; } @@ -336,8 +304,8 @@ ccl_device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, floa float3 co = stack_load_float3(stack, co_offset); float2 uv; - co = normalize(co); - + co = safe_normalize(co); + if(projection == 0) uv = direction_to_equirectangular(co); else diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h index 47209ddfbab..d859cae1708 100644 --- a/intern/cycles/kernel/svm/svm_types.h +++ b/intern/cycles/kernel/svm/svm_types.h @@ -397,17 +397,23 @@ typedef enum ClosureType { CLOSURE_BSDF_DIFFUSE_ID, 
CLOSURE_BSDF_OREN_NAYAR_ID, CLOSURE_BSDF_DIFFUSE_RAMP_ID, + CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID, + CLOSURE_BSDF_PRINCIPLED_SHEEN_ID, CLOSURE_BSDF_DIFFUSE_TOON_ID, /* Glossy */ - CLOSURE_BSDF_GLOSSY_ID, CLOSURE_BSDF_REFLECTION_ID, CLOSURE_BSDF_MICROFACET_GGX_ID, + CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID, + CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID, CLOSURE_BSDF_MICROFACET_BECKMANN_ID, CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID, + CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID, CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID, CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID, + CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID, CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID, + CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_FRESNEL_ID, CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID, CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID, CLOSURE_BSDF_ASHIKHMIN_VELVET_ID, @@ -416,24 +422,26 @@ typedef enum ClosureType { CLOSURE_BSDF_HAIR_REFLECTION_ID, /* Transmission */ - CLOSURE_BSDF_TRANSMISSION_ID, CLOSURE_BSDF_TRANSLUCENT_ID, CLOSURE_BSDF_REFRACTION_ID, CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID, CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID, + CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID, CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID, CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID, - CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID, + CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID, CLOSURE_BSDF_SHARP_GLASS_ID, CLOSURE_BSDF_HAIR_TRANSMISSION_ID, /* Special cases */ CLOSURE_BSDF_BSSRDF_ID, + CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID, CLOSURE_BSDF_TRANSPARENT_ID, /* BSSRDF */ CLOSURE_BSSRDF_CUBIC_ID, CLOSURE_BSSRDF_GAUSSIAN_ID, + CLOSURE_BSSRDF_PRINCIPLED_ID, CLOSURE_BSSRDF_BURLEY_ID, /* Other */ @@ -447,19 +455,24 @@ typedef enum ClosureType { CLOSURE_VOLUME_ABSORPTION_ID, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, + CLOSURE_BSDF_PRINCIPLED_ID, + NBUILTIN_CLOSURES } ClosureType; /* watch this, being lazy with memory usage */ #define CLOSURE_IS_BSDF(type) (type <= CLOSURE_BSDF_TRANSPARENT_ID) #define CLOSURE_IS_BSDF_DIFFUSE(type) (type >= CLOSURE_BSDF_DIFFUSE_ID && type 
<= CLOSURE_BSDF_DIFFUSE_TOON_ID) -#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_GLOSSY_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID) -#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSMISSION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID) -#define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID) +#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID) +#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSLUCENT_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID) +#define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID || type == CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID) +#define CLOSURE_IS_BSDF_TRANSPARENT(type) (type == CLOSURE_BSDF_TRANSPARENT_ID) #define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type >= CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) #define CLOSURE_IS_BSDF_MULTISCATTER(type) (type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID ||\ type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID || \ - type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID) + type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID) +#define CLOSURE_IS_BSDF_MICROFACET(type) ((type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) ||\ + (type >= CLOSURE_BSDF_REFRACTION_ID && type <= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID)) #define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_BURLEY_ID) #define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID) #define CLOSURE_IS_VOLUME(type) (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) @@ -468,7 +481,8 @@ typedef enum ClosureType { #define CLOSURE_IS_BACKGROUND(type) (type == CLOSURE_BACKGROUND_ID) #define CLOSURE_IS_AMBIENT_OCCLUSION(type) (type == CLOSURE_AMBIENT_OCCLUSION_ID) #define CLOSURE_IS_PHASE(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) -#define 
CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID) +#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID) +#define CLOSURE_IS_PRINCIPLED(type) (type == CLOSURE_BSDF_PRINCIPLED_ID) #define CLOSURE_WEIGHT_CUTOFF 1e-5f diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h index 9e826c8c23f..f4a5b2b2994 100644 --- a/intern/cycles/kernel/svm/svm_voxel.h +++ b/intern/cycles/kernel/svm/svm_voxel.h @@ -46,8 +46,13 @@ ccl_device void svm_node_tex_voxel(KernelGlobals *kg, # if defined(__KERNEL_CUDA__) # if __CUDA_ARCH__ >= 300 CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id); - if(id < TEX_START_HALF4_CUDA_KEPLER) + const int texture_type = kernel_tex_type(id); + if(texture_type == IMAGE_DATA_TYPE_FLOAT4 || + texture_type == IMAGE_DATA_TYPE_BYTE4 || + texture_type == IMAGE_DATA_TYPE_HALF4) + { r = kernel_tex_image_interp_3d_float4(tex, co.x, co.y, co.z); + } else { float f = kernel_tex_image_interp_3d_float(tex, co.x, co.y, co.z); r = make_float4(f, f, f, 1.0f); diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp index fe2c2e78926..cf402c3f214 100644 --- a/intern/cycles/render/buffers.cpp +++ b/intern/cycles/render/buffers.cpp @@ -42,6 +42,9 @@ BufferParams::BufferParams() full_width = 0; full_height = 0; + denoising_data_pass = false; + denoising_clean_pass = false; + Pass::add(PASS_COMBINED, passes); } @@ -68,10 +71,25 @@ int BufferParams::get_passes_size() for(size_t i = 0; i < passes.size(); i++) size += passes[i].components; - + + if(denoising_data_pass) { + size += DENOISING_PASS_SIZE_BASE; + if(denoising_clean_pass) size += DENOISING_PASS_SIZE_CLEAN; + } + return align_up(size, 4); } +int BufferParams::get_denoising_offset() +{ + int offset = 0; + + for(size_t i = 0; i < passes.size(); i++) + offset += passes[i].components; + + return offset; +} + /* Render Buffer 
Task */ RenderTile::RenderTile() @@ -138,12 +156,51 @@ void RenderBuffers::reset(Device *device, BufferParams& params_) device->mem_alloc("rng_state", rng_state, MEM_READ_WRITE); } -bool RenderBuffers::copy_from_device() +bool RenderBuffers::copy_from_device(Device *from_device) { if(!buffer.device_pointer) return false; - device->mem_copy_from(buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float)); + if(!from_device) { + from_device = device; + } + + from_device->mem_copy_from(buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float)); + + return true; +} + +bool RenderBuffers::get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels) +{ + float scale = 1.0f/sample; + + if(offset == DENOISING_PASS_COLOR) { + scale *= exposure; + } + else if(offset == DENOISING_PASS_COLOR_VAR) { + scale *= exposure*exposure; + } + + offset += params.get_denoising_offset(); + float *in = (float*)buffer.data_pointer + offset; + int pass_stride = params.get_passes_size(); + int size = params.width*params.height; + + if(components == 1) { + for(int i = 0; i < size; i++, in += pass_stride, pixels++) { + pixels[0] = in[0]*scale; + } + } + else if(components == 3) { + for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) { + pixels[0] = in[0]*scale; + pixels[1] = in[1]*scale; + pixels[2] = in[2]*scale; + } + } + else { + return false; + } return true; } diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h index 5c78971678a..e56556c8abe 100644 --- a/intern/cycles/render/buffers.h +++ b/intern/cycles/render/buffers.h @@ -51,6 +51,9 @@ public: /* passes */ array<Pass> passes; + bool denoising_data_pass; + /* If only some light path types should be denoised, an additional pass is needed. 
*/ + bool denoising_clean_pass; /* functions */ BufferParams(); @@ -59,6 +62,7 @@ public: bool modified(const BufferParams& params); void add_pass(PassType type); int get_passes_size(); + int get_denoising_offset(); }; /* Render Buffers */ @@ -73,18 +77,19 @@ public: /* random number generator state */ device_vector<uint> rng_state; + Device *device; + explicit RenderBuffers(Device *device); ~RenderBuffers(); void reset(Device *device, BufferParams& params); - bool copy_from_device(); + bool copy_from_device(Device *from_device = NULL); bool get_pass_rect(PassType type, float exposure, int sample, int components, float *pixels); + bool get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels); protected: void device_free(); - - Device *device; }; /* Display Buffer @@ -131,6 +136,9 @@ protected: class RenderTile { public: + typedef enum { PATH_TRACE, DENOISE } Task; + + Task task; int x, y, w, h; int start_sample; int num_samples; @@ -138,6 +146,7 @@ public: int resolution; int offset; int stride; + int tile_index; device_ptr buffer; device_ptr rng_state; diff --git a/intern/cycles/render/constant_fold.cpp b/intern/cycles/render/constant_fold.cpp index 2569d9eec27..943b218f0e4 100644 --- a/intern/cycles/render/constant_fold.cpp +++ b/intern/cycles/render/constant_fold.cpp @@ -160,6 +160,14 @@ bool ConstantFolder::try_bypass_or_make_constant(ShaderInput *input, bool clamp) bypass(input->link); return true; } + else { + /* disconnect other inputs if we can't fully bypass due to clamp */ + foreach(ShaderInput *other, node->inputs) { + if(other != input && other->link) { + graph->disconnect(other); + } + } + } return false; } diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp index 7809f4345f1..c8213d258d5 100644 --- a/intern/cycles/render/film.cpp +++ b/intern/cycles/render/film.cpp @@ -279,6 +279,10 @@ NODE_DEFINE(Film) SOCKET_BOOLEAN(use_sample_clamp, "Use Sample Clamp", false); + 
SOCKET_BOOLEAN(denoising_data_pass, "Generate Denoising Data Pass", false); + SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false); + SOCKET_INT(denoising_flags, "Denoising Flags", 0); + return type; } @@ -437,6 +441,20 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) kfilm->pass_stride += pass.components; } + kfilm->pass_denoising_data = 0; + kfilm->pass_denoising_clean = 0; + kfilm->denoising_flags = 0; + if(denoising_data_pass) { + kfilm->pass_denoising_data = kfilm->pass_stride; + kfilm->pass_stride += DENOISING_PASS_SIZE_BASE; + kfilm->denoising_flags = denoising_flags; + if(denoising_clean_pass) { + kfilm->pass_denoising_clean = kfilm->pass_stride; + kfilm->pass_stride += DENOISING_PASS_SIZE_CLEAN; + kfilm->use_light_pass = 1; + } + } + kfilm->pass_stride = align_up(kfilm->pass_stride, 4); kfilm->pass_alpha_threshold = pass_alpha_threshold; @@ -451,6 +469,10 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) kfilm->mist_inv_depth = (mist_depth > 0.0f)? 
1.0f/mist_depth: 0.0f; kfilm->mist_falloff = mist_falloff; + pass_stride = kfilm->pass_stride; + denoising_data_offset = kfilm->pass_denoising_data; + denoising_clean_offset = kfilm->pass_denoising_clean; + need_update = false; } diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h index 83c941d5c57..29b1e7e9157 100644 --- a/intern/cycles/render/film.h +++ b/intern/cycles/render/film.h @@ -57,8 +57,15 @@ public: float exposure; array<Pass> passes; + bool denoising_data_pass; + bool denoising_clean_pass; + int denoising_flags; float pass_alpha_threshold; + int pass_stride; + int denoising_data_offset; + int denoising_clean_offset; + FilterType filter_type; float filter_width; size_t filter_table_offset; diff --git a/intern/cycles/render/graph.cpp b/intern/cycles/render/graph.cpp index 12fff8e5587..2d810ff664f 100644 --- a/intern/cycles/render/graph.cpp +++ b/intern/cycles/render/graph.cpp @@ -423,7 +423,8 @@ void ShaderGraph::copy_nodes(ShaderNodeSet& nodes, ShaderNodeMap& nnodemap) /* Graph simplification */ /* ******************** */ -/* Step 1: Remove proxy nodes. +/* Remove proxy nodes. + * * These only exists temporarily when exporting groups, and we must remove them * early so that node->attributes() and default links do not see them. */ @@ -493,7 +494,8 @@ void ShaderGraph::remove_proxy_nodes() } } -/* Step 2: Constant folding. +/* Constant folding. + * * Try to constant fold some nodes, and pipe result directly to * the input socket of connected nodes. */ @@ -554,7 +556,7 @@ void ShaderGraph::constant_fold() } } -/* Step 3: Simplification. */ +/* Simplification. */ void ShaderGraph::simplify_settings(Scene *scene) { foreach(ShaderNode *node, nodes) { @@ -562,7 +564,7 @@ void ShaderGraph::simplify_settings(Scene *scene) } } -/* Step 4: Deduplicate nodes with same settings. */ +/* Deduplicate nodes with same settings. 
*/ void ShaderGraph::deduplicate_nodes() { /* NOTES: @@ -638,6 +640,48 @@ void ShaderGraph::deduplicate_nodes() } } +/* Check whether volume output has meaningful nodes, otherwise + * disconnect the output. + */ +void ShaderGraph::verify_volume_output() +{ + /* Check whether we can optimize the whole volume graph out. */ + ShaderInput *volume_in = output()->input("Volume"); + if(volume_in->link == NULL) { + return; + } + bool has_valid_volume = false; + ShaderNodeSet scheduled; + queue<ShaderNode*> traverse_queue; + /* Schedule volume output. */ + traverse_queue.push(volume_in->link->parent); + scheduled.insert(volume_in->link->parent); + /* Traverse down the tree. */ + while(!traverse_queue.empty()) { + ShaderNode *node = traverse_queue.front(); + traverse_queue.pop(); + /* Node is fully valid for volume, can't optimize anything out. */ + if(node->has_volume_support()) { + has_valid_volume = true; + break; + } + foreach(ShaderInput *input, node->inputs) { + if(input->link == NULL) { + continue; + } + if(scheduled.find(input->link->parent) != scheduled.end()) { + continue; + } + traverse_queue.push(input->link->parent); + scheduled.insert(input->link->parent); + } + } + if(!has_valid_volume) { + VLOG(1) << "Disconnect meaningless volume output."; + disconnect(volume_in->link); + } +} + void ShaderGraph::break_cycles(ShaderNode *node, vector<bool>& visited, vector<bool>& on_stack) { visited[node->id] = true; @@ -666,16 +710,11 @@ void ShaderGraph::clean(Scene *scene) { /* Graph simplification */ - /* 1: Remove proxy nodes was already done. */ - - /* 2: Constant folding. */ + /* NOTE: Remove proxy nodes was already done. */ constant_fold(); - - /* 3: Simplification. */ simplify_settings(scene); - - /* 4: De-duplication. */ deduplicate_nodes(); + verify_volume_output(); /* we do two things here: find cycles and break them, and remove unused * nodes that don't feed into the output. 
how cycles are broken is @@ -998,6 +1037,9 @@ int ShaderGraph::get_num_closures() else if(CLOSURE_IS_BSDF_MULTISCATTER(closure_type)) { num_closures += 2; } + else if(CLOSURE_IS_PRINCIPLED(closure_type)) { + num_closures += 8; + } else { ++num_closures; } diff --git a/intern/cycles/render/graph.h b/intern/cycles/render/graph.h index 09932695d1f..72e391991a7 100644 --- a/intern/cycles/render/graph.h +++ b/intern/cycles/render/graph.h @@ -155,7 +155,7 @@ public: virtual bool has_spatial_varying() { return false; } virtual bool has_object_dependency() { return false; } virtual bool has_integrator_dependency() { return false; } - + virtual bool has_volume_support() { return false; } vector<ShaderInput*> inputs; vector<ShaderOutput*> outputs; @@ -284,6 +284,7 @@ protected: void constant_fold(); void simplify_settings(Scene *scene); void deduplicate_nodes(); + void verify_volume_output(); }; CCL_NAMESPACE_END diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp index a8c4f446bea..02b65440154 100644 --- a/intern/cycles/render/image.cpp +++ b/intern/cycles/render/image.cpp @@ -30,6 +30,16 @@ CCL_NAMESPACE_BEGIN +/* Some helpers to silence warning in templated function. 
*/ +static bool isfinite(uchar /*value*/) +{ + return false; +} +static bool isfinite(half /*value*/) +{ + return false; +} + ImageManager::ImageManager(const DeviceInfo& info) { need_update = true; @@ -49,54 +59,24 @@ ImageManager::ImageManager(const DeviceInfo& info) } /* Set image limits */ -#define SET_TEX_IMAGES_LIMITS(ARCH) \ - { \ - tex_num_images[IMAGE_DATA_TYPE_FLOAT4] = TEX_NUM_FLOAT4_ ## ARCH; \ - tex_num_images[IMAGE_DATA_TYPE_BYTE4] = TEX_NUM_BYTE4_ ## ARCH; \ - tex_num_images[IMAGE_DATA_TYPE_HALF4] = TEX_NUM_HALF4_ ## ARCH; \ - tex_num_images[IMAGE_DATA_TYPE_FLOAT] = TEX_NUM_FLOAT_ ## ARCH; \ - tex_num_images[IMAGE_DATA_TYPE_BYTE] = TEX_NUM_BYTE_ ## ARCH; \ - tex_num_images[IMAGE_DATA_TYPE_HALF] = TEX_NUM_HALF_ ## ARCH; \ - tex_start_images[IMAGE_DATA_TYPE_FLOAT4] = TEX_START_FLOAT4_ ## ARCH; \ - tex_start_images[IMAGE_DATA_TYPE_BYTE4] = TEX_START_BYTE4_ ## ARCH; \ - tex_start_images[IMAGE_DATA_TYPE_HALF4] = TEX_START_HALF4_ ## ARCH; \ - tex_start_images[IMAGE_DATA_TYPE_FLOAT] = TEX_START_FLOAT_ ## ARCH; \ - tex_start_images[IMAGE_DATA_TYPE_BYTE] = TEX_START_BYTE_ ## ARCH; \ - tex_start_images[IMAGE_DATA_TYPE_HALF] = TEX_START_HALF_ ## ARCH; \ - } - - if(device_type == DEVICE_CPU) { - SET_TEX_IMAGES_LIMITS(CPU); - } - else if(device_type == DEVICE_CUDA) { - if(info.has_bindless_textures) { - SET_TEX_IMAGES_LIMITS(CUDA_KEPLER); - } - else { - SET_TEX_IMAGES_LIMITS(CUDA); + max_num_images = TEX_NUM_MAX; + has_half_images = true; + cuda_fermi_limits = false; + + if(device_type == DEVICE_CUDA) { + if(!info.has_bindless_textures) { + /* CUDA Fermi hardware (SM 2.x) has a hard limit on the number of textures */ + cuda_fermi_limits = true; + has_half_images = false; } } else if(device_type == DEVICE_OPENCL) { - SET_TEX_IMAGES_LIMITS(OPENCL); - } - else { - /* Should not happen. 
*/ - tex_num_images[IMAGE_DATA_TYPE_FLOAT4] = 0; - tex_num_images[IMAGE_DATA_TYPE_BYTE4] = 0; - tex_num_images[IMAGE_DATA_TYPE_HALF4] = 0; - tex_num_images[IMAGE_DATA_TYPE_FLOAT] = 0; - tex_num_images[IMAGE_DATA_TYPE_BYTE] = 0; - tex_num_images[IMAGE_DATA_TYPE_HALF] = 0; - tex_start_images[IMAGE_DATA_TYPE_FLOAT4] = 0; - tex_start_images[IMAGE_DATA_TYPE_BYTE4] = 0; - tex_start_images[IMAGE_DATA_TYPE_HALF4] = 0; - tex_start_images[IMAGE_DATA_TYPE_FLOAT] = 0; - tex_start_images[IMAGE_DATA_TYPE_BYTE] = 0; - tex_start_images[IMAGE_DATA_TYPE_HALF] = 0; - assert(0); + has_half_images = false; } -#undef SET_TEX_IMAGES_LIMITS + for(size_t type = 0; type < IMAGE_DATA_NUM_TYPES; type++) { + tex_num_images[type] = 0; + } } ImageManager::~ImageManager() @@ -133,18 +113,20 @@ bool ImageManager::set_animation_frame_update(int frame) return false; } -ImageManager::ImageDataType ImageManager::get_image_metadata(const string& filename, - void *builtin_data, - bool& is_linear) +ImageDataType ImageManager::get_image_metadata(const string& filename, + void *builtin_data, + bool& is_linear, + bool& builtin_free_cache) { bool is_float = false, is_half = false; is_linear = false; + builtin_free_cache = false; int channels = 4; if(builtin_data) { if(builtin_image_info_cb) { int width, height, depth; - builtin_image_info_cb(filename, builtin_data, is_float, width, height, depth, channels); + builtin_image_info_cb(filename, builtin_data, is_float, width, height, depth, channels, builtin_free_cache); } if(is_float) { @@ -226,26 +208,28 @@ ImageManager::ImageDataType ImageManager::get_image_metadata(const string& filen } } -/* We use a consecutive slot counting scheme on the devices, in order - * float4, byte4, half4, float, byte, half. +int ImageManager::max_flattened_slot(ImageDataType type) +{ + if(tex_num_images[type] == 0) { + /* No textures for the type, no slots needs allocation. 
*/ + return 0; + } + return type_index_to_flattened_slot(tex_num_images[type], type); +} + +/* The lower three bits of a device texture slot number indicate its type. * These functions convert the slot ids from ImageManager "images" ones - * to device ones and vice versa. */ + * to device ones and vice verse. + */ int ImageManager::type_index_to_flattened_slot(int slot, ImageDataType type) { - return slot + tex_start_images[type]; + return (slot << IMAGE_DATA_TYPE_SHIFT) | (type); } int ImageManager::flattened_slot_to_type_index(int flat_slot, ImageDataType *type) { - for(int i = IMAGE_DATA_NUM_TYPES - 1; i >= 0; i--) { - if(flat_slot >= tex_start_images[i]) { - *type = (ImageDataType)i; - return flat_slot - tex_start_images[i]; - } - } - - /* Should not happen. */ - return flat_slot; + *type = (ImageDataType)(flat_slot & IMAGE_DATA_TYPE_MASK); + return flat_slot >> IMAGE_DATA_TYPE_SHIFT; } string ImageManager::name_from_type(int type) @@ -290,8 +274,9 @@ int ImageManager::add_image(const string& filename, { Image *img; size_t slot; + bool builtin_free_cache; - ImageDataType type = get_image_metadata(filename, builtin_data, is_linear); + ImageDataType type = get_image_metadata(filename, builtin_data, is_linear, builtin_free_cache); thread_scoped_lock device_lock(device_mutex); @@ -299,14 +284,22 @@ int ImageManager::add_image(const string& filename, is_float = (type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4); /* No single channel and half textures on CUDA (Fermi) and no half on OpenCL, use available slots */ - if((type == IMAGE_DATA_TYPE_FLOAT || - type == IMAGE_DATA_TYPE_HALF4 || - type == IMAGE_DATA_TYPE_HALF) && - tex_num_images[type] == 0) { - type = IMAGE_DATA_TYPE_FLOAT4; + if(!has_half_images) { + if(type == IMAGE_DATA_TYPE_HALF4) { + type = IMAGE_DATA_TYPE_FLOAT4; + } + else if(type == IMAGE_DATA_TYPE_HALF) { + type = IMAGE_DATA_TYPE_FLOAT; + } } - if(type == IMAGE_DATA_TYPE_BYTE && tex_num_images[type] == 0) { - type = 
IMAGE_DATA_TYPE_BYTE4; + + if(cuda_fermi_limits) { + if(type == IMAGE_DATA_TYPE_FLOAT) { + type = IMAGE_DATA_TYPE_FLOAT4; + } + else if(type == IMAGE_DATA_TYPE_BYTE) { + type = IMAGE_DATA_TYPE_BYTE4; + } } /* Fnd existing image. */ @@ -338,14 +331,30 @@ int ImageManager::add_image(const string& filename, break; } - if(slot == images[type].size()) { - /* Max images limit reached. */ - if(images[type].size() == tex_num_images[type]) { + /* Count if we're over the limit */ + if(cuda_fermi_limits) { + if(tex_num_images[IMAGE_DATA_TYPE_BYTE4] == TEX_NUM_BYTE4_CUDA + || tex_num_images[IMAGE_DATA_TYPE_FLOAT4] == TEX_NUM_FLOAT4_CUDA) + { printf("ImageManager::add_image: Reached %s image limit (%d), skipping '%s'\n", - name_from_type(type).c_str(), tex_num_images[type], filename.c_str()); + name_from_type(type).c_str(), tex_num_images[type], filename.c_str()); return -1; } + } + else { + /* Very unlikely, since max_num_images is insanely big. But better safe than sorry. */ + int tex_count = 0; + for (int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) { + tex_count += tex_num_images[type]; + } + if(tex_count > max_num_images) { + printf("ImageManager::add_image: Reached image limit (%d), skipping '%s'\n", + max_num_images, filename.c_str()); + return -1; + } + } + if(slot == images[type].size()) { images[type].resize(images[type].size() + 1); } @@ -353,6 +362,7 @@ int ImageManager::add_image(const string& filename, img = new Image(); img->filename = filename; img->builtin_data = builtin_data; + img->builtin_free_cache = builtin_free_cache; img->need_load = true; img->animated = animated; img->frame = frame; @@ -363,6 +373,8 @@ int ImageManager::add_image(const string& filename, images[type][slot] = img; + ++tex_num_images[type]; + need_update = true; return type_index_to_flattened_slot(slot, type); @@ -436,7 +448,12 @@ void ImageManager::tag_reload_image(const string& filename, } } -bool ImageManager::file_load_image_generic(Image *img, ImageInput **in, int &width, int 
&height, int &depth, int &components) +bool ImageManager::file_load_image_generic(Image *img, + ImageInput **in, + int &width, + int &height, + int &depth, + int &components) { if(img->filename == "") return false; @@ -475,8 +492,8 @@ bool ImageManager::file_load_image_generic(Image *img, ImageInput **in, int &wid if(!builtin_image_info_cb || !builtin_image_pixels_cb) return false; - bool is_float; - builtin_image_info_cb(img->filename, img->builtin_data, is_float, width, height, depth, components); + bool is_float, free_cache; + builtin_image_info_cb(img->filename, img->builtin_data, is_float, width, height, depth, components, free_cache); } /* we only handle certain number of components */ @@ -557,13 +574,15 @@ bool ImageManager::file_load_image(Image *img, builtin_image_float_pixels_cb(img->filename, img->builtin_data, (float*)&pixels[0], - num_pixels * components); + num_pixels * components, + img->builtin_free_cache); } else if(FileFormat == TypeDesc::UINT8) { builtin_image_pixels_cb(img->filename, img->builtin_data, (uchar*)&pixels[0], - num_pixels * components); + num_pixels * components, + img->builtin_free_cache); } else { /* TODO(dingto): Support half for ImBuf. */ @@ -618,6 +637,37 @@ bool ImageManager::file_load_image(Image *img, } } } + /* Make sure we don't have buggy values. */ + if(FileFormat == TypeDesc::FLOAT) { + /* For RGBA buffers we put all channels to 0 if either of them is not + * finite. This way we avoid possible artifacts caused by fully changed + * hue. + */ + if(is_rgba) { + for(size_t i = 0; i < num_pixels; i += 4) { + StorageType *pixel = &pixels[i*4]; + if(!isfinite(pixel[0]) || + !isfinite(pixel[1]) || + !isfinite(pixel[2]) || + !isfinite(pixel[3])) + { + pixel[0] = 0; + pixel[1] = 0; + pixel[2] = 0; + pixel[3] = 0; + } + } + } + else { + for(size_t i = 0; i < num_pixels; ++i) { + StorageType *pixel = &pixels[i]; + if(!isfinite(pixel[0])) { + pixel[0] = 0; + } + } + } + } + /* Scale image down if needed. 
*/ if(pixels_storage.size() > 0) { float scale_factor = 1.0f; while(max_size * scale_factor > texture_limit) { @@ -666,16 +716,12 @@ void ImageManager::device_load_image(Device *device, /* Slot assignment */ int flat_slot = type_index_to_flattened_slot(slot, type); - string name; - if(flat_slot >= 100) - name = string_printf("__tex_image_%s_%d", name_from_type(type).c_str(), flat_slot); - else if(flat_slot >= 10) - name = string_printf("__tex_image_%s_0%d", name_from_type(type).c_str(), flat_slot); - else - name = string_printf("__tex_image_%s_00%d", name_from_type(type).c_str(), flat_slot); + string name = string_printf("__tex_image_%s_%03d", name_from_type(type).c_str(), flat_slot); if(type == IMAGE_DATA_TYPE_FLOAT4) { - device_vector<float4>& tex_img = dscene->tex_float4_image[slot]; + if(dscene->tex_float4_image[slot] == NULL) + dscene->tex_float4_image[slot] = new device_vector<float4>(); + device_vector<float4>& tex_img = *dscene->tex_float4_image[slot]; if(tex_img.device_pointer) { thread_scoped_lock device_lock(device_mutex); @@ -705,7 +751,9 @@ void ImageManager::device_load_image(Device *device, } } else if(type == IMAGE_DATA_TYPE_FLOAT) { - device_vector<float>& tex_img = dscene->tex_float_image[slot]; + if(dscene->tex_float_image[slot] == NULL) + dscene->tex_float_image[slot] = new device_vector<float>(); + device_vector<float>& tex_img = *dscene->tex_float_image[slot]; if(tex_img.device_pointer) { thread_scoped_lock device_lock(device_mutex); @@ -732,7 +780,9 @@ void ImageManager::device_load_image(Device *device, } } else if(type == IMAGE_DATA_TYPE_BYTE4) { - device_vector<uchar4>& tex_img = dscene->tex_byte4_image[slot]; + if(dscene->tex_byte4_image[slot] == NULL) + dscene->tex_byte4_image[slot] = new device_vector<uchar4>(); + device_vector<uchar4>& tex_img = *dscene->tex_byte4_image[slot]; if(tex_img.device_pointer) { thread_scoped_lock device_lock(device_mutex); @@ -762,7 +812,9 @@ void ImageManager::device_load_image(Device *device, } } else 
if(type == IMAGE_DATA_TYPE_BYTE){ - device_vector<uchar>& tex_img = dscene->tex_byte_image[slot]; + if(dscene->tex_byte_image[slot] == NULL) + dscene->tex_byte_image[slot] = new device_vector<uchar>(); + device_vector<uchar>& tex_img = *dscene->tex_byte_image[slot]; if(tex_img.device_pointer) { thread_scoped_lock device_lock(device_mutex); @@ -788,7 +840,9 @@ void ImageManager::device_load_image(Device *device, } } else if(type == IMAGE_DATA_TYPE_HALF4){ - device_vector<half4>& tex_img = dscene->tex_half4_image[slot]; + if(dscene->tex_half4_image[slot] == NULL) + dscene->tex_half4_image[slot] = new device_vector<half4>(); + device_vector<half4>& tex_img = *dscene->tex_half4_image[slot]; if(tex_img.device_pointer) { thread_scoped_lock device_lock(device_mutex); @@ -817,7 +871,9 @@ void ImageManager::device_load_image(Device *device, } } else if(type == IMAGE_DATA_TYPE_HALF){ - device_vector<half>& tex_img = dscene->tex_half_image[slot]; + if(dscene->tex_half_image[slot] == NULL) + dscene->tex_half_image[slot] = new device_vector<half>(); + device_vector<half>& tex_img = *dscene->tex_half_image[slot]; if(tex_img.device_pointer) { thread_scoped_lock device_lock(device_mutex); @@ -857,69 +913,100 @@ void ImageManager::device_free_image(Device *device, DeviceScene *dscene, ImageD ((OSL::TextureSystem*)osl_texture_system)->invalidate(filename); #endif } - else if(type == IMAGE_DATA_TYPE_FLOAT4) { - device_vector<float4>& tex_img = dscene->tex_float4_image[slot]; - - if(tex_img.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); - } - - tex_img.clear(); - } - else if(type == IMAGE_DATA_TYPE_FLOAT) { - device_vector<float>& tex_img = dscene->tex_float_image[slot]; - - if(tex_img.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); - } - - tex_img.clear(); - } - else if(type == IMAGE_DATA_TYPE_BYTE4) { - device_vector<uchar4>& tex_img = dscene->tex_byte4_image[slot]; - - 
if(tex_img.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); - } - - tex_img.clear(); - } - else if(type == IMAGE_DATA_TYPE_BYTE){ - device_vector<uchar>& tex_img = dscene->tex_byte_image[slot]; - - if(tex_img.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); - } - - tex_img.clear(); - } - else if(type == IMAGE_DATA_TYPE_HALF4){ - device_vector<half4>& tex_img = dscene->tex_half4_image[slot]; - - if(tex_img.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); + else { + device_memory *tex_img = NULL; + switch(type) { + case IMAGE_DATA_TYPE_FLOAT4: + if(slot >= dscene->tex_float4_image.size()) { + break; + } + tex_img = dscene->tex_float4_image[slot]; + dscene->tex_float4_image[slot] = NULL; + break; + case IMAGE_DATA_TYPE_BYTE4: + if(slot >= dscene->tex_byte4_image.size()) { + break; + } + tex_img = dscene->tex_byte4_image[slot]; + dscene->tex_byte4_image[slot]= NULL; + break; + case IMAGE_DATA_TYPE_HALF4: + if(slot >= dscene->tex_half4_image.size()) { + break; + } + tex_img = dscene->tex_half4_image[slot]; + dscene->tex_half4_image[slot]= NULL; + break; + case IMAGE_DATA_TYPE_FLOAT: + if(slot >= dscene->tex_float_image.size()) { + break; + } + tex_img = dscene->tex_float_image[slot]; + dscene->tex_float_image[slot] = NULL; + break; + case IMAGE_DATA_TYPE_BYTE: + if(slot >= dscene->tex_byte_image.size()) { + break; + } + tex_img = dscene->tex_byte_image[slot]; + dscene->tex_byte_image[slot]= NULL; + break; + case IMAGE_DATA_TYPE_HALF: + if(slot >= dscene->tex_half_image.size()) { + break; + } + tex_img = dscene->tex_half_image[slot]; + dscene->tex_half_image[slot]= NULL; + break; + default: + assert(0); + tex_img = NULL; } + if(tex_img) { + if(tex_img->device_pointer) { + thread_scoped_lock device_lock(device_mutex); + device->tex_free(*tex_img); + } - tex_img.clear(); - } - else if(type == IMAGE_DATA_TYPE_HALF){ - device_vector<half>& 
tex_img = dscene->tex_half_image[slot]; - - if(tex_img.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); + delete tex_img; } - - tex_img.clear(); } delete images[type][slot]; images[type][slot] = NULL; + --tex_num_images[type]; + } +} + +void ImageManager::device_prepare_update(DeviceScene *dscene) +{ + for(int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) { + switch(type) { + case IMAGE_DATA_TYPE_FLOAT4: + if(dscene->tex_float4_image.size() <= tex_num_images[IMAGE_DATA_TYPE_FLOAT4]) + dscene->tex_float4_image.resize(tex_num_images[IMAGE_DATA_TYPE_FLOAT4]); + break; + case IMAGE_DATA_TYPE_BYTE4: + if(dscene->tex_byte4_image.size() <= tex_num_images[IMAGE_DATA_TYPE_BYTE4]) + dscene->tex_byte4_image.resize(tex_num_images[IMAGE_DATA_TYPE_BYTE4]); + break; + case IMAGE_DATA_TYPE_HALF4: + if(dscene->tex_half4_image.size() <= tex_num_images[IMAGE_DATA_TYPE_HALF4]) + dscene->tex_half4_image.resize(tex_num_images[IMAGE_DATA_TYPE_HALF4]); + break; + case IMAGE_DATA_TYPE_BYTE: + if(dscene->tex_byte_image.size() <= tex_num_images[IMAGE_DATA_TYPE_BYTE]) + dscene->tex_byte_image.resize(tex_num_images[IMAGE_DATA_TYPE_BYTE]); + break; + case IMAGE_DATA_TYPE_FLOAT: + if(dscene->tex_float_image.size() <= tex_num_images[IMAGE_DATA_TYPE_FLOAT]) + dscene->tex_float_image.resize(tex_num_images[IMAGE_DATA_TYPE_FLOAT]); + break; + case IMAGE_DATA_TYPE_HALF: + if(dscene->tex_half_image.size() <= tex_num_images[IMAGE_DATA_TYPE_HALF]) + dscene->tex_half_image.resize(tex_num_images[IMAGE_DATA_TYPE_HALF]); + break; + } } } @@ -928,11 +1015,14 @@ void ImageManager::device_update(Device *device, Scene *scene, Progress& progress) { - if(!need_update) + if(!need_update) { return; + } - TaskPool pool; + /* Make sure arrays are proper size. 
*/ + device_prepare_update(dscene); + TaskPool pool; for(int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) { for(size_t slot = 0; slot < images[type].size(); slot++) { if(!images[type][slot]) @@ -992,159 +1082,101 @@ void ImageManager::device_update_slot(Device *device, uint8_t ImageManager::pack_image_options(ImageDataType type, size_t slot) { uint8_t options = 0; - /* Image Options are packed into one uint: * bit 0 -> Interpolation - * bit 1 + 2 + 3-> Extension */ - if(images[type][slot]->interpolation == INTERPOLATION_CLOSEST) + * bit 1 + 2 + 3 -> Extension + */ + if(images[type][slot]->interpolation == INTERPOLATION_CLOSEST) { options |= (1 << 0); - - if(images[type][slot]->extension == EXTENSION_REPEAT) + } + if(images[type][slot]->extension == EXTENSION_REPEAT) { options |= (1 << 1); - else if(images[type][slot]->extension == EXTENSION_EXTEND) + } + else if(images[type][slot]->extension == EXTENSION_EXTEND) { options |= (1 << 2); - else /* EXTENSION_CLIP */ + } + else /* EXTENSION_CLIP */ { options |= (1 << 3); - + } return options; } -void ImageManager::device_pack_images(Device *device, - DeviceScene *dscene, - Progress& /*progess*/) +template<typename T> +void ImageManager::device_pack_images_type( + ImageDataType type, + const vector<device_vector<T>*>& cpu_textures, + device_vector<T> *device_image, + uint4 *info) { - /* For OpenCL, we pack all image textures into a single large texture, and - * do our own interpolation in the kernel. 
*/ size_t size = 0, offset = 0; - ImageDataType type; - - int info_size = tex_num_images[IMAGE_DATA_TYPE_FLOAT4] + tex_num_images[IMAGE_DATA_TYPE_BYTE4] - + tex_num_images[IMAGE_DATA_TYPE_FLOAT] + tex_num_images[IMAGE_DATA_TYPE_BYTE]; - uint4 *info = dscene->tex_image_packed_info.resize(info_size*2); - - /* Byte4 Textures*/ - type = IMAGE_DATA_TYPE_BYTE4; - - for(size_t slot = 0; slot < images[type].size(); slot++) { - if(!images[type][slot]) - continue; - - device_vector<uchar4>& tex_img = dscene->tex_byte4_image[slot]; - size += tex_img.size(); - } - - uchar4 *pixels_byte4 = dscene->tex_image_byte4_packed.resize(size); - + /* First step is to calculate size of the texture we need. */ for(size_t slot = 0; slot < images[type].size(); slot++) { - if(!images[type][slot]) - continue; - - device_vector<uchar4>& tex_img = dscene->tex_byte4_image[slot]; - - uint8_t options = pack_image_options(type, slot); - - int index = type_index_to_flattened_slot(slot, type) * 2; - info[index] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options); - info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0); - - memcpy(pixels_byte4+offset, (void*)tex_img.data_pointer, tex_img.memory_size()); - offset += tex_img.size(); - } - - /* Float4 Textures*/ - type = IMAGE_DATA_TYPE_FLOAT4; - size = 0, offset = 0; - - for(size_t slot = 0; slot < images[type].size(); slot++) { - if(!images[type][slot]) - continue; - - device_vector<float4>& tex_img = dscene->tex_float4_image[slot]; - size += tex_img.size(); - } - - float4 *pixels_float4 = dscene->tex_image_float4_packed.resize(size); - - for(size_t slot = 0; slot < images[type].size(); slot++) { - if(!images[type][slot]) - continue; - - device_vector<float4>& tex_img = dscene->tex_float4_image[slot]; - - /* todo: support 3D textures, only CPU for now */ - - uint8_t options = pack_image_options(type, slot); - - int index = type_index_to_flattened_slot(slot, type) * 2; - info[index] = make_uint4(tex_img.data_width, 
tex_img.data_height, offset, options); - info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0); - - memcpy(pixels_float4+offset, (void*)tex_img.data_pointer, tex_img.memory_size()); - offset += tex_img.size(); - } - - /* Byte Textures*/ - type = IMAGE_DATA_TYPE_BYTE; - size = 0, offset = 0; - - for(size_t slot = 0; slot < images[type].size(); slot++) { - if(!images[type][slot]) + if(images[type][slot] == NULL) { continue; - - device_vector<uchar>& tex_img = dscene->tex_byte_image[slot]; + } + device_vector<T>& tex_img = *cpu_textures[slot]; size += tex_img.size(); } - - uchar *pixels_byte = dscene->tex_image_byte_packed.resize(size); - + /* Now we know how much memory we need, so we can allocate and fill. */ + T *pixels = device_image->resize(size); for(size_t slot = 0; slot < images[type].size(); slot++) { - if(!images[type][slot]) + if(images[type][slot] == NULL) { continue; - - device_vector<uchar>& tex_img = dscene->tex_byte_image[slot]; - + } + device_vector<T>& tex_img = *cpu_textures[slot]; uint8_t options = pack_image_options(type, slot); - - int index = type_index_to_flattened_slot(slot, type) * 2; - info[index] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options); + const int index = type_index_to_flattened_slot(slot, type) * 2; + info[index] = make_uint4(tex_img.data_width, + tex_img.data_height, + offset, + options); info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0); - - memcpy(pixels_byte+offset, (void*)tex_img.data_pointer, tex_img.memory_size()); + memcpy(pixels + offset, + (void*)tex_img.data_pointer, + tex_img.memory_size()); offset += tex_img.size(); } +} - /* Float Textures*/ - type = IMAGE_DATA_TYPE_FLOAT; - size = 0, offset = 0; - - for(size_t slot = 0; slot < images[type].size(); slot++) { - if(!images[type][slot]) - continue; - - device_vector<float>& tex_img = dscene->tex_float_image[slot]; - size += tex_img.size(); - } - - float *pixels_float = dscene->tex_image_float_packed.resize(size); - - for(size_t slot = 0; 
slot < images[type].size(); slot++) { - if(!images[type][slot]) - continue; - - device_vector<float>& tex_img = dscene->tex_float_image[slot]; - - /* todo: support 3D textures, only CPU for now */ - - uint8_t options = pack_image_options(type, slot); - - int index = type_index_to_flattened_slot(slot, type) * 2; - info[index] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options); - info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0); +void ImageManager::device_pack_images(Device *device, + DeviceScene *dscene, + Progress& /*progess*/) +{ + /* For OpenCL, we pack all image textures into a single large texture, and + * do our own interpolation in the kernel. + */ - memcpy(pixels_float+offset, (void*)tex_img.data_pointer, tex_img.memory_size()); - offset += tex_img.size(); - } + /* TODO(sergey): This will over-allocate a bit, but this is constant memory + * so should be fine for a short term. + */ + const size_t info_size = max4(max_flattened_slot(IMAGE_DATA_TYPE_FLOAT4), + max_flattened_slot(IMAGE_DATA_TYPE_BYTE4), + max_flattened_slot(IMAGE_DATA_TYPE_FLOAT), + max_flattened_slot(IMAGE_DATA_TYPE_BYTE)); + uint4 *info = dscene->tex_image_packed_info.resize(info_size*2); + /* Pack byte4 textures. */ + device_pack_images_type(IMAGE_DATA_TYPE_BYTE4, + dscene->tex_byte4_image, + &dscene->tex_image_byte4_packed, + info); + /* Pack float4 textures. */ + device_pack_images_type(IMAGE_DATA_TYPE_FLOAT4, + dscene->tex_float4_image, + &dscene->tex_image_float4_packed, + info); + /* Pack byte textures. */ + device_pack_images_type(IMAGE_DATA_TYPE_BYTE, + dscene->tex_byte_image, + &dscene->tex_image_byte_packed, + info); + /* Pack float textures. */ + device_pack_images_type(IMAGE_DATA_TYPE_FLOAT, + dscene->tex_float_image, + &dscene->tex_image_float_packed, + info); + + /* Push textures to the device. 
*/ if(dscene->tex_image_byte4_packed.size()) { if(dscene->tex_image_byte4_packed.device_pointer) { thread_scoped_lock device_lock(device_mutex); @@ -1201,16 +1233,23 @@ void ImageManager::device_free(Device *device, DeviceScene *dscene) images[type].clear(); } - device->tex_free(dscene->tex_image_byte4_packed); + dscene->tex_float4_image.clear(); + dscene->tex_byte4_image.clear(); + dscene->tex_half4_image.clear(); + dscene->tex_float_image.clear(); + dscene->tex_byte_image.clear(); + dscene->tex_half_image.clear(); + device->tex_free(dscene->tex_image_float4_packed); - device->tex_free(dscene->tex_image_byte_packed); + device->tex_free(dscene->tex_image_byte4_packed); device->tex_free(dscene->tex_image_float_packed); + device->tex_free(dscene->tex_image_byte_packed); device->tex_free(dscene->tex_image_packed_info); - dscene->tex_image_byte4_packed.clear(); dscene->tex_image_float4_packed.clear(); - dscene->tex_image_byte_packed.clear(); + dscene->tex_image_byte4_packed.clear(); dscene->tex_image_float_packed.clear(); + dscene->tex_image_byte_packed.clear(); dscene->tex_image_packed_info.clear(); } diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h index 996b5a5b65f..db7e28a5e44 100644 --- a/intern/cycles/render/image.h +++ b/intern/cycles/render/image.h @@ -37,17 +37,6 @@ public: explicit ImageManager(const DeviceInfo& info); ~ImageManager(); - enum ImageDataType { - IMAGE_DATA_TYPE_FLOAT4 = 0, - IMAGE_DATA_TYPE_BYTE4 = 1, - IMAGE_DATA_TYPE_HALF4 = 2, - IMAGE_DATA_TYPE_FLOAT = 3, - IMAGE_DATA_TYPE_BYTE = 4, - IMAGE_DATA_TYPE_HALF = 5, - - IMAGE_DATA_NUM_TYPES - }; - int add_image(const string& filename, void *builtin_data, bool animated, @@ -68,8 +57,12 @@ public: InterpolationType interpolation, ExtensionType extension, bool use_alpha); - ImageDataType get_image_metadata(const string& filename, void *builtin_data, bool& is_linear); + ImageDataType get_image_metadata(const string& filename, + void *builtin_data, + bool& is_linear, + bool& 
builtin_free_cache); + void device_prepare_update(DeviceScene *dscene); void device_update(Device *device, DeviceScene *dscene, Scene *scene, @@ -98,19 +91,23 @@ public: int &width, int &height, int &depth, - int &channels)> builtin_image_info_cb; + int &channels, + bool &free_cache)> builtin_image_info_cb; function<bool(const string &filename, void *data, unsigned char *pixels, - const size_t pixels_size)> builtin_image_pixels_cb; + const size_t pixels_size, + const bool free_cache)> builtin_image_pixels_cb; function<bool(const string &filename, void *data, float *pixels, - const size_t pixels_size)> builtin_image_float_pixels_cb; + const size_t pixels_size, + const bool free_cache)> builtin_image_float_pixels_cb; struct Image { string filename; void *builtin_data; + bool builtin_free_cache; bool use_alpha; bool need_load; @@ -124,7 +121,9 @@ public: private: int tex_num_images[IMAGE_DATA_NUM_TYPES]; - int tex_start_images[IMAGE_DATA_NUM_TYPES]; + int max_num_images; + bool has_half_images; + bool cuda_fermi_limits; thread_mutex device_mutex; int animation_frame; @@ -133,7 +132,12 @@ private: void *osl_texture_system; bool pack_images; - bool file_load_image_generic(Image *img, ImageInput **in, int &width, int &height, int &depth, int &components); + bool file_load_image_generic(Image *img, + ImageInput **in, + int &width, + int &height, + int &depth, + int &components); template<TypeDesc::BASETYPE FileFormat, typename StorageType, @@ -143,6 +147,7 @@ private: int texture_limit, device_vector<DeviceType>& tex_img); + int max_flattened_slot(ImageDataType type); int type_index_to_flattened_slot(int slot, ImageDataType type); int flattened_slot_to_type_index(int flat_slot, ImageDataType *type); string name_from_type(int type); @@ -160,6 +165,13 @@ private: ImageDataType type, int slot); + template<typename T> + void device_pack_images_type( + ImageDataType type, + const vector<device_vector<T>*>& cpu_textures, + device_vector<T> *device_image, + uint4 *info); + void 
device_pack_images(Device *device, DeviceScene *dscene, Progress& progess); diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp index 4886dcd563f..93d88c5642c 100644 --- a/intern/cycles/render/light.cpp +++ b/intern/cycles/render/light.cpp @@ -224,6 +224,10 @@ void LightManager::disable_ineffective_light(Device *device, Scene *scene) bool LightManager::object_usable_as_light(Object *object) { Mesh *mesh = object->mesh; + /* Skip objects with NaNs */ + if (!object->bounds.valid()) { + return false; + } /* Skip if we are not visible for BSDFs. */ if(!(object->visibility & (PATH_RAY_DIFFUSE|PATH_RAY_GLOSSY|PATH_RAY_TRANSMIT))) { return false; @@ -486,18 +490,10 @@ static void background_cdf(int start, float2 *cond_cdf) { /* Conditional CDFs (rows, U direction). */ - /* NOTE: It is possible to have some NaN pixels on background - * which will ruin CDF causing wrong shading. We replace such - * pixels with black. - */ for(int i = start; i < end; i++) { float sin_theta = sinf(M_PI_F * (i + 0.5f) / res); float3 env_color = (*pixels)[i * res]; float ave_luminance = average(env_color); - /* TODO(sergey): Consider adding average_safe(). 
*/ - if(!isfinite(ave_luminance)) { - ave_luminance = 0.0f; - } cond_cdf[i * cdf_count].x = ave_luminance * sin_theta; cond_cdf[i * cdf_count].y = 0.0f; @@ -505,9 +501,6 @@ static void background_cdf(int start, for(int j = 1; j < res; j++) { env_color = (*pixels)[i * res + j]; ave_luminance = average(env_color); - if(!isfinite(ave_luminance)) { - ave_luminance = 0.0f; - } cond_cdf[i * cdf_count + j].x = ave_luminance * sin_theta; cond_cdf[i * cdf_count + j].y = cond_cdf[i * cdf_count + j - 1].y + cond_cdf[i * cdf_count + j - 1].x / res; diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp index a4dc06c4345..03825f780e0 100644 --- a/intern/cycles/render/mesh.cpp +++ b/intern/cycles/render/mesh.cpp @@ -903,7 +903,7 @@ void Mesh::pack_normals(Scene *scene, uint *tri_shader, float4 *vnormal) float3 vNi = vN[i]; if(do_transform) - vNi = normalize(transform_direction(&ntfm, vNi)); + vNi = safe_normalize(transform_direction(&ntfm, vNi)); vnormal[i] = make_float4(vNi.x, vNi.y, vNi.z, 0.0f); } @@ -1944,6 +1944,7 @@ void MeshManager::device_update_displacement_images(Device *device, } } } + image_manager->device_prepare_update(dscene); foreach(int slot, bump_images) { pool.push(function_bind(&ImageManager::device_update_slot, image_manager, diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp index cf28bb16bb7..4ca20cf7ef3 100644 --- a/intern/cycles/render/mesh_displace.cpp +++ b/intern/cycles/render/mesh_displace.cpp @@ -169,6 +169,8 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me if(!done[t.v[j]]) { done[t.v[j]] = true; float3 off = float4_to_float3(offset[k++]); + /* Avoid illegal vertex coordinates. 
*/ + off = ensure_finite3(off); mesh->verts[t.v[j]] += off; if(attr_mP != NULL) { for(int step = 0; step < mesh->motion_steps - 1; step++) { diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp index 1070e05a03b..90a68a06cb5 100644 --- a/intern/cycles/render/nodes.cpp +++ b/intern/cycles/render/nodes.cpp @@ -364,9 +364,10 @@ void ImageTextureNode::compile(OSLCompiler& compiler) image_manager = compiler.image_manager; if(is_float == -1) { if(builtin_data == NULL) { - ImageManager::ImageDataType type; - type = image_manager->get_image_metadata(filename.string(), NULL, is_linear); - if(type == ImageManager::IMAGE_DATA_TYPE_FLOAT || type == ImageManager::IMAGE_DATA_TYPE_FLOAT4) + ImageDataType type; + bool builtin_free_cache; + type = image_manager->get_image_metadata(filename.string(), NULL, is_linear, builtin_free_cache); + if(type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4) is_float = 1; } else { @@ -553,9 +554,10 @@ void EnvironmentTextureNode::compile(OSLCompiler& compiler) image_manager = compiler.image_manager; if(is_float == -1) { if(builtin_data == NULL) { - ImageManager::ImageDataType type; - type = image_manager->get_image_metadata(filename.string(), NULL, is_linear); - if(type == ImageManager::IMAGE_DATA_TYPE_FLOAT || type == ImageManager::IMAGE_DATA_TYPE_FLOAT4) + ImageDataType type; + bool builtin_free_cache; + type = image_manager->get_image_metadata(filename.string(), NULL, is_linear, builtin_free_cache); + if(type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4) is_float = 1; } else { @@ -1791,12 +1793,19 @@ void ConvertNode::compile(OSLCompiler& compiler) assert(0); } +/* Base type for all closure-type nodes */ + +BsdfBaseNode::BsdfBaseNode(const NodeType *node_type) + : ShaderNode(node_type) +{ + special_type = SHADER_SPECIAL_TYPE_CLOSURE; +} + /* BSDF Closure */ BsdfNode::BsdfNode(const NodeType *node_type) -: ShaderNode(node_type) +: BsdfBaseNode(node_type) { - special_type = 
SHADER_SPECIAL_TYPE_CLOSURE; } void BsdfNode::compile(SVMCompiler& compiler, ShaderInput *param1, ShaderInput *param2, ShaderInput *param3, ShaderInput *param4) @@ -2286,6 +2295,155 @@ void DiffuseBsdfNode::compile(OSLCompiler& compiler) compiler.add(this, "node_diffuse_bsdf"); } +/* Disney principled BSDF Closure */ +NODE_DEFINE(PrincipledBsdfNode) +{ + NodeType* type = NodeType::add("principled_bsdf", create, NodeType::SHADER); + + static NodeEnum distribution_enum; + distribution_enum.insert("GGX", CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID); + distribution_enum.insert("Multiscatter GGX", CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID); + SOCKET_ENUM(distribution, "Distribution", distribution_enum, CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID); + SOCKET_IN_COLOR(base_color, "Base Color", make_float3(0.8f, 0.8f, 0.8f)); + SOCKET_IN_COLOR(subsurface_color, "Subsurface Color", make_float3(0.8f, 0.8f, 0.8f)); + SOCKET_IN_FLOAT(metallic, "Metallic", 0.0f); + SOCKET_IN_FLOAT(subsurface, "Subsurface", 0.0f); + SOCKET_IN_VECTOR(subsurface_radius, "Subsurface Radius", make_float3(0.1f, 0.1f, 0.1f)); + SOCKET_IN_FLOAT(specular, "Specular", 0.0f); + SOCKET_IN_FLOAT(roughness, "Roughness", 0.5f); + SOCKET_IN_FLOAT(specular_tint, "Specular Tint", 0.0f); + SOCKET_IN_FLOAT(anisotropic, "Anisotropic", 0.0f); + SOCKET_IN_FLOAT(sheen, "Sheen", 0.0f); + SOCKET_IN_FLOAT(sheen_tint, "Sheen Tint", 0.0f); + SOCKET_IN_FLOAT(clearcoat, "Clearcoat", 0.0f); + SOCKET_IN_FLOAT(clearcoat_roughness, "Clearcoat Roughness", 0.03f); + SOCKET_IN_FLOAT(ior, "IOR", 0.0f); + SOCKET_IN_FLOAT(transmission, "Transmission", 0.0f); + SOCKET_IN_FLOAT(transmission_roughness, "Transmission Roughness", 0.0f); + SOCKET_IN_FLOAT(anisotropic_rotation, "Anisotropic Rotation", 0.0f); + SOCKET_IN_NORMAL(normal, "Normal", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_NORMAL); + SOCKET_IN_NORMAL(clearcoat_normal, "Clearcoat Normal", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_NORMAL); + SOCKET_IN_NORMAL(tangent, 
"Tangent", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_TANGENT); + SOCKET_IN_FLOAT(surface_mix_weight, "SurfaceMixWeight", 0.0f, SocketType::SVM_INTERNAL); + + SOCKET_OUT_CLOSURE(BSDF, "BSDF"); + + return type; +} + +PrincipledBsdfNode::PrincipledBsdfNode() + : BsdfBaseNode(node_type) +{ + closure = CLOSURE_BSDF_PRINCIPLED_ID; + distribution = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID; + distribution_orig = NBUILTIN_CLOSURES; +} + +bool PrincipledBsdfNode::has_surface_bssrdf() +{ + ShaderInput *subsurface_in = input("Subsurface"); + return (subsurface_in->link != NULL || subsurface > CLOSURE_WEIGHT_CUTOFF); +} + +void PrincipledBsdfNode::attributes(Shader *shader, AttributeRequestSet *attributes) +{ + if(shader->has_surface) { + ShaderInput *tangent_in = input("Tangent"); + + if(!tangent_in->link) + attributes->add(ATTR_STD_GENERATED); + } + + ShaderNode::attributes(shader, attributes); +} + +void PrincipledBsdfNode::compile(SVMCompiler& compiler, ShaderInput *p_metallic, ShaderInput *p_subsurface, ShaderInput *p_subsurface_radius, + ShaderInput *p_specular, ShaderInput *p_roughness, ShaderInput *p_specular_tint, ShaderInput *p_anisotropic, + ShaderInput *p_sheen, ShaderInput *p_sheen_tint, ShaderInput *p_clearcoat, ShaderInput *p_clearcoat_roughness, + ShaderInput *p_ior, ShaderInput *p_transmission, ShaderInput *p_anisotropic_rotation, ShaderInput *p_transmission_roughness) +{ + ShaderInput *base_color_in = input("Base Color"); + ShaderInput *subsurface_color_in = input("Subsurface Color"); + ShaderInput *normal_in = input("Normal"); + ShaderInput *clearcoat_normal_in = input("Clearcoat Normal"); + ShaderInput *tangent_in = input("Tangent"); + + float3 weight = make_float3(1.0f, 1.0f, 1.0f); + + compiler.add_node(NODE_CLOSURE_SET_WEIGHT, weight); + + int normal_offset = compiler.stack_assign_if_linked(normal_in); + int clearcoat_normal_offset = compiler.stack_assign_if_linked(clearcoat_normal_in); + int tangent_offset = 
compiler.stack_assign_if_linked(tangent_in); + int specular_offset = compiler.stack_assign(p_specular); + int roughness_offset = compiler.stack_assign(p_roughness); + int specular_tint_offset = compiler.stack_assign(p_specular_tint); + int anisotropic_offset = compiler.stack_assign(p_anisotropic); + int sheen_offset = compiler.stack_assign(p_sheen); + int sheen_tint_offset = compiler.stack_assign(p_sheen_tint); + int clearcoat_offset = compiler.stack_assign(p_clearcoat); + int clearcoat_roughness_offset = compiler.stack_assign(p_clearcoat_roughness); + int ior_offset = compiler.stack_assign(p_ior); + int transmission_offset = compiler.stack_assign(p_transmission); + int transmission_roughness_offset = compiler.stack_assign(p_transmission_roughness); + int anisotropic_rotation_offset = compiler.stack_assign(p_anisotropic_rotation); + int subsurface_radius_offset = compiler.stack_assign(p_subsurface_radius); + + compiler.add_node(NODE_CLOSURE_BSDF, + compiler.encode_uchar4(closure, + compiler.stack_assign(p_metallic), + compiler.stack_assign(p_subsurface), + compiler.closure_mix_weight_offset()), + __float_as_int((p_metallic) ? get_float(p_metallic->socket_type) : 0.0f), + __float_as_int((p_subsurface) ? get_float(p_subsurface->socket_type) : 0.0f)); + + compiler.add_node(normal_offset, tangent_offset, + compiler.encode_uchar4(specular_offset, roughness_offset, specular_tint_offset, anisotropic_offset), + compiler.encode_uchar4(sheen_offset, sheen_tint_offset, clearcoat_offset, clearcoat_roughness_offset)); + + compiler.add_node(compiler.encode_uchar4(ior_offset, transmission_offset, anisotropic_rotation_offset, transmission_roughness_offset), + distribution, SVM_STACK_INVALID, SVM_STACK_INVALID); + + float3 bc_default = get_float3(base_color_in->socket_type); + + compiler.add_node(((base_color_in->link) ? 
compiler.stack_assign(base_color_in) : SVM_STACK_INVALID), + __float_as_int(bc_default.x), __float_as_int(bc_default.y), __float_as_int(bc_default.z)); + + compiler.add_node(clearcoat_normal_offset, subsurface_radius_offset, SVM_STACK_INVALID, SVM_STACK_INVALID); + + float3 ss_default = get_float3(subsurface_color_in->socket_type); + + compiler.add_node(((subsurface_color_in->link) ? compiler.stack_assign(subsurface_color_in) : SVM_STACK_INVALID), + __float_as_int(ss_default.x), __float_as_int(ss_default.y), __float_as_int(ss_default.z)); +} + +bool PrincipledBsdfNode::has_integrator_dependency() +{ + ShaderInput *roughness_input = input("Roughness"); + return !roughness_input->link && roughness <= 1e-4f; +} + +void PrincipledBsdfNode::compile(SVMCompiler& compiler) +{ + compile(compiler, input("Metallic"), input("Subsurface"), input("Subsurface Radius"), input("Specular"), + input("Roughness"), input("Specular Tint"), input("Anisotropic"), input("Sheen"), input("Sheen Tint"), + input("Clearcoat"), input("Clearcoat Roughness"), input("IOR"), input("Transmission"), + input("Anisotropic Rotation"), input("Transmission Roughness")); +} + +void PrincipledBsdfNode::compile(OSLCompiler& compiler) +{ + compiler.parameter(this, "distribution"); + compiler.add(this, "node_principled_bsdf"); +} + +bool PrincipledBsdfNode::has_bssrdf_bump() +{ + /* detect if anything is plugged into the normal input besides the default */ + ShaderInput *normal_in = input("Normal"); + return (normal_in->link && normal_in->link->parent->special_type != SHADER_SPECIAL_TYPE_GEOMETRY); +} + /* Translucent BSDF Closure */ NODE_DEFINE(TranslucentBsdfNode) diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h index a755b653a5b..c0271a3c8eb 100644 --- a/intern/cycles/render/nodes.h +++ b/intern/cycles/render/nodes.h @@ -252,6 +252,7 @@ public: class PointDensityTextureNode : public ShaderNode { public: SHADER_NODE_NO_CLONE_CLASS(PointDensityTextureNode) + virtual int get_group() { 
return NODE_GROUP_LEVEL_3; } ~PointDensityTextureNode(); ShaderNode *clone() const; @@ -321,7 +322,14 @@ private: static bool initialized; }; -class BsdfNode : public ShaderNode { +class BsdfBaseNode : public ShaderNode { +public: + BsdfBaseNode(const NodeType *node_type); + + ClosureType closure; +}; + +class BsdfNode : public BsdfBaseNode { public: explicit BsdfNode(const NodeType *node_type); SHADER_NODE_BASE_CLASS(BsdfNode) @@ -333,7 +341,6 @@ public: float3 color; float3 normal; float surface_mix_weight; - ClosureType closure; virtual bool equals(const ShaderNode& /*other*/) { @@ -361,6 +368,39 @@ public: float roughness; }; +/* Disney principled BRDF */ +class PrincipledBsdfNode : public BsdfBaseNode { +public: + SHADER_NODE_CLASS(PrincipledBsdfNode) + + bool has_spatial_varying() { return true; } + bool has_surface_bssrdf(); + bool has_bssrdf_bump(); + void compile(SVMCompiler& compiler, ShaderInput *metallic, ShaderInput *subsurface, ShaderInput *subsurface_radius, + ShaderInput *specular, ShaderInput *roughness, ShaderInput *specular_tint, ShaderInput *anisotropic, + ShaderInput *sheen, ShaderInput *sheen_tint, ShaderInput *clearcoat, ShaderInput *clearcoat_roughness, + ShaderInput *ior, ShaderInput *transmission, ShaderInput *anisotropic_rotation, ShaderInput *transmission_roughness); + + float3 base_color; + float3 subsurface_color, subsurface_radius; + float metallic, subsurface, specular, roughness, specular_tint, anisotropic, + sheen, sheen_tint, clearcoat, clearcoat_roughness, ior, transmission, + anisotropic_rotation, transmission_roughness; + float3 normal, clearcoat_normal, tangent; + float surface_mix_weight; + ClosureType distribution, distribution_orig; + + virtual bool equals(const ShaderNode * /*other*/) + { + /* TODO(sergey): With some care BSDF nodes can be de-duplicated. 
*/ + return false; + } + + ClosureType get_closure_type() { return closure; } + bool has_integrator_dependency(); + void attributes(Shader *shader, AttributeRequestSet *attributes); +}; + class TranslucentBsdfNode : public BsdfNode { public: SHADER_NODE_CLASS(TranslucentBsdfNode) @@ -445,6 +485,7 @@ public: virtual ClosureType get_closure_type() { return CLOSURE_EMISSION_ID; } bool has_surface_emission() { return true; } + bool has_volume_support() { return true; } float3 color; float strength; @@ -496,6 +537,7 @@ public: return ShaderNode::get_feature() | NODE_FEATURE_VOLUME; } virtual ClosureType get_closure_type() { return closure; } + virtual bool has_volume_support() { return true; } float3 color; float density; diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp index 6bff29d1c76..a794f233718 100644 --- a/intern/cycles/render/osl.cpp +++ b/intern/cycles/render/osl.cpp @@ -156,6 +156,7 @@ void OSLShaderManager::device_free(Device *device, DeviceScene *dscene, Scene *s og->surface_state.clear(); og->volume_state.clear(); og->displacement_state.clear(); + og->bump_state.clear(); og->background_state.reset(); } diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h index 2b5267642a2..4c2c4f5fcc3 100644 --- a/intern/cycles/render/scene.h +++ b/intern/cycles/render/scene.h @@ -114,18 +114,18 @@ public: device_vector<uint> sobol_directions; /* cpu images */ - device_vector<uchar4> tex_byte4_image[TEX_NUM_BYTE4_CPU]; - device_vector<float4> tex_float4_image[TEX_NUM_FLOAT4_CPU]; - device_vector<float> tex_float_image[TEX_NUM_FLOAT_CPU]; - device_vector<uchar> tex_byte_image[TEX_NUM_BYTE_CPU]; - device_vector<half4> tex_half4_image[TEX_NUM_HALF4_CPU]; - device_vector<half> tex_half_image[TEX_NUM_HALF_CPU]; + vector<device_vector<float4>* > tex_float4_image; + vector<device_vector<uchar4>* > tex_byte4_image; + vector<device_vector<half4>* > tex_half4_image; + vector<device_vector<float>* > tex_float_image; + 
vector<device_vector<uchar>* > tex_byte_image; + vector<device_vector<half>* > tex_half_image; /* opencl images */ - device_vector<uchar4> tex_image_byte4_packed; device_vector<float4> tex_image_float4_packed; - device_vector<uchar> tex_image_byte_packed; + device_vector<uchar4> tex_image_byte4_packed; device_vector<float> tex_image_float_packed; + device_vector<uchar> tex_image_byte_packed; device_vector<uint4> tex_image_packed_info; KernelData data; diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index c9b5547b407..8622318858e 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -114,8 +114,9 @@ Session::~Session() } /* clean up */ - foreach(RenderBuffers *buffers, tile_buffers) - delete buffers; + foreach(RenderTile &rtile, render_tiles) + delete rtile.buffers; + tile_manager.free_device(); delete buffers; delete display; @@ -268,8 +269,8 @@ void Session::run_gpu() /* update status and timing */ update_status_time(); - /* path trace */ - path_trace(); + /* render */ + render(); device->task_wait(); @@ -358,20 +359,22 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile) thread_scoped_lock tile_lock(tile_mutex); /* get next tile from manager */ - Tile tile; + Tile *tile; int device_num = device->device_number(tile_device); if(!tile_manager.next_tile(tile, device_num)) return false; /* fill render tile */ - rtile.x = tile_manager.state.buffer.full_x + tile.x; - rtile.y = tile_manager.state.buffer.full_y + tile.y; - rtile.w = tile.w; - rtile.h = tile.h; + rtile.x = tile_manager.state.buffer.full_x + tile->x; + rtile.y = tile_manager.state.buffer.full_y + tile->y; + rtile.w = tile->w; + rtile.h = tile->h; rtile.start_sample = tile_manager.state.sample; rtile.num_samples = tile_manager.state.num_samples; rtile.resolution = tile_manager.state.resolution_divider; + rtile.tile_index = tile->index; + rtile.task = (tile->state == Tile::DENOISE)? 
RenderTile::DENOISE: RenderTile::PATH_TRACE; tile_lock.unlock(); @@ -383,54 +386,70 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile) rtile.buffer = buffers->buffer.device_pointer; rtile.rng_state = buffers->rng_state.device_pointer; rtile.buffers = buffers; + tile->buffers = buffers; device->map_tile(tile_device, rtile); return true; } - /* fill buffer parameters */ - BufferParams buffer_params = tile_manager.params; - buffer_params.full_x = rtile.x; - buffer_params.full_y = rtile.y; - buffer_params.width = rtile.w; - buffer_params.height = rtile.h; - - buffer_params.get_offset_stride(rtile.offset, rtile.stride); - - RenderBuffers *tilebuffers; + bool store_rtile = false; + if(tile->buffers == NULL) { + /* fill buffer parameters */ + BufferParams buffer_params = tile_manager.params; + buffer_params.full_x = rtile.x; + buffer_params.full_y = rtile.y; + buffer_params.width = rtile.w; + buffer_params.height = rtile.h; + + /* allocate buffers */ + if(params.progressive_refine) { + tile_lock.lock(); + + if(render_tiles.size() == 0) { + RenderTile nulltile; + nulltile.buffers = NULL; + render_tiles.resize(tile_manager.state.num_tiles, nulltile); + } - /* allocate buffers */ - if(params.progressive_refine) { - tile_lock.lock(); + /* In certain circumstances number of tiles in the tile manager could + * be changed. This is not supported by the progressive refine feature. 
+ */ + assert(render_tiles.size() == tile_manager.state.num_tiles); - if(tile_buffers.size() == 0) - tile_buffers.resize(tile_manager.state.num_tiles, NULL); + RenderTile &stored_rtile = render_tiles[tile->index]; + if(stored_rtile.buffers == NULL) { + tile->buffers = new RenderBuffers(tile_device); + tile->buffers->reset(tile_device, buffer_params); + store_rtile = true; + } + else { + assert(rtile.x == stored_rtile.x && + rtile.y == stored_rtile.y && + rtile.w == stored_rtile.w && + rtile.h == stored_rtile.h); + tile_lock.unlock(); + tile->buffers = stored_rtile.buffers; + } + } + else { + tile->buffers = new RenderBuffers(tile_device); - /* In certain circumstances number of tiles in the tile manager could - * be changed. This is not supported by the progressive refine feature. - */ - assert(tile_buffers.size() == tile_manager.state.num_tiles); + tile->buffers->reset(tile_device, buffer_params); + } + } - tilebuffers = tile_buffers[tile.index]; - if(tilebuffers == NULL) { - tilebuffers = new RenderBuffers(tile_device); - tile_buffers[tile.index] = tilebuffers; + tile->buffers->params.get_offset_stride(rtile.offset, rtile.stride); - tilebuffers->reset(tile_device, buffer_params); - } + rtile.buffer = tile->buffers->buffer.device_pointer; + rtile.rng_state = tile->buffers->rng_state.device_pointer; + rtile.buffers = tile->buffers; + rtile.sample = 0; + if(store_rtile) { + render_tiles[tile->index] = rtile; tile_lock.unlock(); } - else { - tilebuffers = new RenderBuffers(tile_device); - - tilebuffers->reset(tile_device, buffer_params); - } - - rtile.buffer = tilebuffers->buffer.device_pointer; - rtile.rng_state = tilebuffers->rng_state.device_pointer; - rtile.buffers = tilebuffers; /* this will tag tile as IN PROGRESS in blender-side render pipeline, * which is needed to highlight currently rendering tile before first @@ -449,7 +468,7 @@ void Session::update_tile_sample(RenderTile& rtile) if(params.progressive_refine == false) { /* todo: optimize this by making it 
thread safe and removing lock */ - update_render_tile_cb(rtile); + update_render_tile_cb(rtile, true); } } @@ -460,20 +479,77 @@ void Session::release_tile(RenderTile& rtile) { thread_scoped_lock tile_lock(tile_mutex); - progress.add_finished_tile(); + progress.add_finished_tile(rtile.task == RenderTile::DENOISE); - if(write_render_tile_cb) { - if(params.progressive_refine == false) { - /* todo: optimize this by making it thread safe and removing lock */ - write_render_tile_cb(rtile); + bool delete_tile; - delete rtile.buffers; + if(tile_manager.finish_tile(rtile.tile_index, delete_tile)) { + if(write_render_tile_cb && params.progressive_refine == false) { + write_render_tile_cb(rtile); + if(delete_tile) { + delete rtile.buffers; + tile_manager.state.tiles[rtile.tile_index].buffers = NULL; + } + } + } + else { + if(update_render_tile_cb && params.progressive_refine == false) { + update_render_tile_cb(rtile, false); } } update_status_time(); } +void Session::map_neighbor_tiles(RenderTile *tiles, Device *tile_device) +{ + thread_scoped_lock tile_lock(tile_mutex); + + int center_idx = tiles[4].tile_index; + assert(tile_manager.state.tiles[center_idx].state == Tile::DENOISE); + BufferParams buffer_params = tile_manager.params; + int4 image_region = make_int4(buffer_params.full_x, buffer_params.full_y, + buffer_params.full_x + buffer_params.width, buffer_params.full_y + buffer_params.height); + + for(int dy = -1, i = 0; dy <= 1; dy++) { + for(int dx = -1; dx <= 1; dx++, i++) { + int px = tiles[4].x + dx*params.tile_size.x; + int py = tiles[4].y + dy*params.tile_size.y; + if(px >= image_region.x && py >= image_region.y && + px < image_region.z && py < image_region.w) { + int tile_index = center_idx + dy*tile_manager.state.tile_stride + dx; + Tile *tile = &tile_manager.state.tiles[tile_index]; + assert(tile->buffers); + + tiles[i].buffer = tile->buffers->buffer.device_pointer; + tiles[i].x = tile_manager.state.buffer.full_x + tile->x; + tiles[i].y = 
tile_manager.state.buffer.full_y + tile->y; + tiles[i].w = tile->w; + tiles[i].h = tile->h; + tiles[i].buffers = tile->buffers; + + tile->buffers->params.get_offset_stride(tiles[i].offset, tiles[i].stride); + } + else { + tiles[i].buffer = (device_ptr)NULL; + tiles[i].buffers = NULL; + tiles[i].x = clamp(px, image_region.x, image_region.z); + tiles[i].y = clamp(py, image_region.y, image_region.w); + tiles[i].w = tiles[i].h = 0; + } + } + } + + assert(tiles[4].buffers); + device->map_neighbor_tiles(tile_device, tiles); +} + +void Session::unmap_neighbor_tiles(RenderTile *tiles, Device *tile_device) +{ + thread_scoped_lock tile_lock(tile_mutex); + device->unmap_neighbor_tiles(tile_device, tiles); +} + void Session::run_cpu() { bool tiles_written = false; @@ -558,8 +634,8 @@ void Session::run_cpu() /* update status and timing */ update_status_time(); - /* path trace */ - path_trace(); + /* render */ + render(); /* update status and timing */ update_status_time(); @@ -646,20 +722,25 @@ DeviceRequestedFeatures Session::get_requested_device_features() requested_features.use_baking = bake_manager->get_baking(); requested_features.use_integrator_branched = (scene->integrator->method == Integrator::BRANCHED_PATH); requested_features.use_transparent &= scene->integrator->transparent_shadows; + requested_features.use_denoising = params.use_denoising; return requested_features; } -void Session::load_kernels() +void Session::load_kernels(bool lock_scene) { - thread_scoped_lock scene_lock(scene->mutex); + thread_scoped_lock scene_lock; + if(lock_scene) { + scene_lock = thread_scoped_lock(scene->mutex); + } + + DeviceRequestedFeatures requested_features = get_requested_device_features(); - if(!kernels_loaded) { + if(!kernels_loaded || loaded_kernel_features.modified(requested_features)) { progress.set_status("Loading render kernels (may take a few minutes the first time)"); scoped_timer timer; - DeviceRequestedFeatures requested_features = get_requested_device_features(); VLOG(2) 
<< "Requested features:\n" << requested_features; if(!device->load_kernels(requested_features)) { string message = device->error_message(); @@ -676,6 +757,7 @@ void Session::load_kernels() VLOG(1) << "Total time spent loading kernels: " << time_dt() - timer.get_start(); kernels_loaded = true; + loaded_kernel_features = requested_features; } } @@ -744,10 +826,10 @@ void Session::reset(BufferParams& buffer_params, int samples) if(params.progressive_refine) { thread_scoped_lock buffers_lock(buffers_mutex); - foreach(RenderBuffers *buffers, tile_buffers) - delete buffers; + foreach(RenderTile &rtile, render_tiles) + delete rtile.buffers; - tile_buffers.clear(); + render_tiles.clear(); } } @@ -826,6 +908,8 @@ void Session::update_scene() /* update scene */ if(scene->need_update()) { + load_kernels(false); + progress.set_status("Updating Scene"); MEM_GUARDED_CALL(&progress, scene->device_update, device, progress); } @@ -836,7 +920,7 @@ void Session::update_status_time(bool show_pause, bool show_done) int progressive_sample = tile_manager.state.sample; int num_samples = tile_manager.get_num_effective_samples(); - int tile = progress.get_finished_tiles(); + int tile = progress.get_rendered_tiles(); int num_tiles = tile_manager.state.num_tiles; /* update status */ @@ -844,11 +928,12 @@ void Session::update_status_time(bool show_pause, bool show_done) if(!params.progressive) { const bool is_cpu = params.device.type == DEVICE_CPU; + const bool rendering_finished = (tile == num_tiles); const bool is_last_tile = (tile + 1) == num_tiles; substatus = string_printf("Path Tracing Tile %d/%d", tile, num_tiles); - if(device->show_samples() || (is_cpu && is_last_tile)) { + if(!rendering_finished && (device->show_samples() || (is_cpu && is_last_tile))) { /* Some devices automatically support showing the sample number: * - CUDADevice * - OpenCLDevice when using the megakernel (the split kernel renders multiple @@ -860,6 +945,9 @@ void Session::update_status_time(bool show_pause, bool 
show_done) */ substatus += string_printf(", Sample %d/%d", progress.get_current_sample(), num_samples); } + if(params.use_denoising) { + substatus += string_printf(", Denoised %d tiles", progress.get_denoised_tiles()); + } } else if(tile_manager.num_samples == INT_MAX) substatus = string_printf("Path Tracing Sample %d", progressive_sample+1); @@ -873,6 +961,7 @@ void Session::update_status_time(bool show_pause, bool show_done) } else if(show_done) { status = "Done"; + progress.set_end_time(); /* Save end time so that further calls to get_time are accurate. */ } else { status = substatus; @@ -882,13 +971,15 @@ void Session::update_status_time(bool show_pause, bool show_done) progress.set_status(status, substatus); } -void Session::path_trace() +void Session::render() { /* add path trace task */ - DeviceTask task(DeviceTask::PATH_TRACE); + DeviceTask task(DeviceTask::RENDER); task.acquire_tile = function_bind(&Session::acquire_tile, this, _1, _2); task.release_tile = function_bind(&Session::release_tile, this, _1); + task.map_neighbor_tiles = function_bind(&Session::map_neighbor_tiles, this, _1, _2); + task.unmap_neighbor_tiles = function_bind(&Session::unmap_neighbor_tiles, this, _1, _2); task.get_cancel = function_bind(&Progress::get_cancel, &this->progress); task.update_tile_sample = function_bind(&Session::update_tile_sample, this, _1); task.update_progress_sample = function_bind(&Progress::add_samples, &this->progress, _1, _2); @@ -897,6 +988,18 @@ void Session::path_trace() task.requested_tile_size = params.tile_size; task.passes_size = tile_manager.params.get_passes_size(); + if(params.use_denoising) { + task.denoising_radius = params.denoising_radius; + task.denoising_strength = params.denoising_strength; + task.denoising_feature_strength = params.denoising_feature_strength; + task.denoising_relative_pca = params.denoising_relative_pca; + + assert(!scene->film->need_update); + task.pass_stride = scene->film->pass_stride; + task.pass_denoising_data = 
scene->film->denoising_data_offset; + task.pass_denoising_clean = scene->film->denoising_clean_offset; + } + device->task_add(task); } @@ -940,9 +1043,7 @@ bool Session::update_progressive_refine(bool cancel) } if(params.progressive_refine) { - foreach(RenderBuffers *buffers, tile_buffers) { - RenderTile rtile; - rtile.buffers = buffers; + foreach(RenderTile &rtile, render_tiles) { rtile.sample = sample; if(write) { @@ -951,7 +1052,7 @@ bool Session::update_progressive_refine(bool cancel) } else { if(update_render_tile_cb) - update_render_tile_cb(rtile); + update_render_tile_cb(rtile, true); } } } @@ -965,10 +1066,11 @@ void Session::device_free() { scene->device_free(); - foreach(RenderBuffers *buffers, tile_buffers) - delete buffers; + foreach(RenderTile &tile, render_tiles) + delete tile.buffers; + tile_manager.free_device(); - tile_buffers.clear(); + render_tiles.clear(); /* used from background render only, so no need to * re-create render/display buffers here diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h index a7e5f78a64d..9f8bb8c42fa 100644 --- a/intern/cycles/render/session.h +++ b/intern/cycles/render/session.h @@ -57,6 +57,12 @@ public: bool display_buffer_linear; + bool use_denoising; + int denoising_radius; + float denoising_strength; + float denoising_feature_strength; + bool denoising_relative_pca; + double cancel_timeout; double reset_timeout; double text_timeout; @@ -77,6 +83,12 @@ public: start_resolution = INT_MAX; threads = 0; + use_denoising = false; + denoising_radius = 8; + denoising_strength = 0.0f; + denoising_feature_strength = 0.0f; + denoising_relative_pca = false; + display_buffer_linear = false; cancel_timeout = 0.1; @@ -126,7 +138,7 @@ public: Stats stats; function<void(RenderTile&)> write_render_tile_cb; - function<void(RenderTile&)> update_render_tile_cb; + function<void(RenderTile&, bool)> update_render_tile_cb; explicit Session(const SessionParams& params); ~Session(); @@ -141,7 +153,7 @@ public: void 
set_pause(bool pause); void update_scene(); - void load_kernels(); + void load_kernels(bool lock_scene=true); void device_free(); @@ -162,7 +174,7 @@ protected: void update_status_time(bool show_pause = false, bool show_done = false); void tonemap(int sample); - void path_trace(); + void render(); void reset_(BufferParams& params, int samples); void run_cpu(); @@ -177,6 +189,9 @@ protected: void update_tile_sample(RenderTile& tile); void release_tile(RenderTile& tile); + void map_neighbor_tiles(RenderTile *tiles, Device *tile_device); + void unmap_neighbor_tiles(RenderTile *tiles, Device *tile_device); + bool device_use_gl; thread *session_thread; @@ -195,6 +210,7 @@ protected: thread_mutex display_mutex; bool kernels_loaded; + DeviceRequestedFeatures loaded_kernel_features; double reset_time; @@ -202,7 +218,7 @@ protected: double last_update_time; bool update_progressive_refine(bool cancel); - vector<RenderBuffers *> tile_buffers; + vector<RenderTile> render_tiles; DeviceRequestedFeatures get_requested_device_features(); diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp index 23eee1916bd..44a266dfe18 100644 --- a/intern/cycles/render/shader.cpp +++ b/intern/cycles/render/shader.cpp @@ -49,6 +49,16 @@ static float beckmann_table_slope_max() return 6.0; } + +/* MSVC 2015 needs this ugly hack to prevent a codegen bug on x86 + * see T50176 for details + */ +#if defined(_MSC_VER) && (_MSC_VER == 1900) +# define MSVC_VOLATILE volatile +#else +# define MSVC_VOLATILE +#endif + /* Paper used: Importance Sampling Microfacet-Based BSDFs with the * Distribution of Visible Normals. Supplemental Material 2/2. 
* @@ -72,7 +82,7 @@ static void beckmann_table_rows(float *table, int row_from, int row_to) slope_x[0] = (double)-beckmann_table_slope_max(); CDF_P22_omega_i[0] = 0; - for(int index_slope_x = 1; index_slope_x < DATA_TMP_SIZE; ++index_slope_x) { + for(MSVC_VOLATILE int index_slope_x = 1; index_slope_x < DATA_TMP_SIZE; ++index_slope_x) { /* slope_x */ slope_x[index_slope_x] = (double)(-beckmann_table_slope_max() + 2.0f * beckmann_table_slope_max() * index_slope_x/(DATA_TMP_SIZE - 1.0f)); @@ -116,6 +126,8 @@ static void beckmann_table_rows(float *table, int row_from, int row_to) } } +#undef MSVC_VOLATILE + static void beckmann_table_build(vector<float>& table) { table.resize(BECKMANN_TABLE_SIZE*BECKMANN_TABLE_SIZE); @@ -178,6 +190,7 @@ Shader::Shader() has_volume_spatial_varying = false; has_object_dependency = false; has_integrator_dependency = false; + has_volume_connected = false; displacement_method = DISPLACE_BUMP; @@ -229,6 +242,10 @@ void Shader::set_graph(ShaderGraph *graph_) delete graph_bump; graph = graph_; graph_bump = NULL; + + /* Store info here before graph optimization to make sure that + * nodes that get optimized away still count. */ + has_volume_connected = (graph->output()->input("Volume")->link != NULL); } void Shader::tag_update(Scene *scene) @@ -319,11 +336,14 @@ ShaderManager *ShaderManager::create(Scene *scene, int shadingsystem) (void)shadingsystem; /* Ignored when built without OSL. */ #ifdef WITH_OSL - if(shadingsystem == SHADINGSYSTEM_OSL) + if(shadingsystem == SHADINGSYSTEM_OSL) { manager = new OSLShaderManager(); + } else #endif + { manager = new SVMShaderManager(); + } add_default(scene); @@ -420,15 +440,14 @@ void ShaderManager::device_update_common(Device *device, flag |= SD_HAS_VOLUME; has_volumes = true; - /* in this case we can assume transparent surface */ - if(!shader->has_surface) - flag |= SD_HAS_ONLY_VOLUME; - /* todo: this could check more fine grained, to skip useless volumes * enclosed inside an opaque bsdf. 
*/ flag |= SD_HAS_TRANSPARENT_SHADOW; } + /* in this case we can assume transparent surface */ + if(shader->has_volume_connected && !shader->has_surface) + flag |= SD_HAS_ONLY_VOLUME; if(shader->heterogeneous_volume && shader->has_volume_spatial_varying) flag |= SD_HETEROGENEOUS_VOLUME; if(shader->has_bssrdf_bump) @@ -569,6 +588,9 @@ void ShaderManager::get_requested_graph_features(ShaderGraph *graph, if(CLOSURE_IS_VOLUME(bsdf_node->closure)) { requested_features->nodes_features |= NODE_FEATURE_VOLUME; } + else if(CLOSURE_IS_PRINCIPLED(bsdf_node->closure)) { + requested_features->use_principled = true; + } } if(node->has_surface_bssrdf()) { requested_features->use_subsurface = true; diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h index a8018231f1a..b6714b13247 100644 --- a/intern/cycles/render/shader.h +++ b/intern/cycles/render/shader.h @@ -105,6 +105,15 @@ public: bool need_update; bool need_update_attributes; + /* If the shader has only volume components, the surface is assumed to + * be transparent. + * However, graph optimization might remove the volume subgraph, but + * since the user connected something to the volume output the surface + * should still be transparent. + * Therefore, has_volume_connected stores whether some volume subtree + * was connected before optimization. 
*/ + bool has_volume_connected; + /* information about shader after compiling */ bool has_surface; bool has_surface_emission; diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp index 944e746ca2d..176a1f4f0f3 100644 --- a/intern/cycles/render/tile.cpp +++ b/intern/cycles/render/tile.cpp @@ -25,37 +25,39 @@ namespace { class TileComparator { public: - TileComparator(TileOrder order, int2 center) - : order_(order), - center_(center) + TileComparator(TileOrder order_, int2 center_, Tile *tiles_) + : order(order_), + center(center_), + tiles(tiles_) {} - bool operator()(Tile &a, Tile &b) + bool operator()(int a, int b) { - switch(order_) { + switch(order) { case TILE_CENTER: { - float2 dist_a = make_float2(center_.x - (a.x + a.w/2), - center_.y - (a.y + a.h/2)); - float2 dist_b = make_float2(center_.x - (b.x + b.w/2), - center_.y - (b.y + b.h/2)); + float2 dist_a = make_float2(center.x - (tiles[a].x + tiles[a].w/2), + center.y - (tiles[a].y + tiles[a].h/2)); + float2 dist_b = make_float2(center.x - (tiles[b].x + tiles[b].w/2), + center.y - (tiles[b].y + tiles[b].h/2)); return dot(dist_a, dist_a) < dot(dist_b, dist_b); } case TILE_LEFT_TO_RIGHT: - return (a.x == b.x)? (a.y < b.y): (a.x < b.x); + return (tiles[a].x == tiles[b].x)? (tiles[a].y < tiles[b].y): (tiles[a].x < tiles[b].x); case TILE_RIGHT_TO_LEFT: - return (a.x == b.x)? (a.y < b.y): (a.x > b.x); + return (tiles[a].x == tiles[b].x)? (tiles[a].y < tiles[b].y): (tiles[a].x > tiles[b].x); case TILE_TOP_TO_BOTTOM: - return (a.y == b.y)? (a.x < b.x): (a.y > b.y); + return (tiles[a].y == tiles[b].y)? (tiles[a].x < tiles[b].x): (tiles[a].y > tiles[b].y); case TILE_BOTTOM_TO_TOP: default: - return (a.y == b.y)? (a.x < b.x): (a.y < b.y); + return (tiles[a].y == tiles[b].y)? 
(tiles[a].x < tiles[b].x): (tiles[a].y < tiles[b].y); } } protected: - TileOrder order_; - int2 center_; + TileOrder order; + int2 center; + Tile *tiles; }; inline int2 hilbert_index_to_pos(int n, int d) @@ -96,6 +98,7 @@ TileManager::TileManager(bool progressive_, int num_samples_, int2 tile_size_, i num_devices = num_devices_; preserve_tile_device = preserve_tile_device_; background = background_; + schedule_denoising = false; range_start_sample = 0; range_num_samples = -1; @@ -108,6 +111,16 @@ TileManager::~TileManager() { } +void TileManager::free_device() +{ + if(schedule_denoising) { + for(int i = 0; i < state.tiles.size(); i++) { + delete state.tiles[i].buffers; + state.tiles[i].buffers = NULL; + } + } +} + static int get_divider(int w, int h, int start_resolution) { int divider = 1; @@ -133,6 +146,8 @@ void TileManager::reset(BufferParams& params_, int num_samples_) state.num_tiles = 0; state.num_samples = 0; state.resolution_divider = get_divider(params.width, params.height, start_resolution); + state.render_tiles.clear(); + state.denoising_tiles.clear(); state.tiles.clear(); } @@ -157,6 +172,9 @@ void TileManager::set_samples(int num_samples_) } state.total_pixel_samples = pixel_samples + (uint64_t)get_num_effective_samples() * params.width*params.height; + if(schedule_denoising) { + state.total_pixel_samples += params.width*params.height; + } } } @@ -169,32 +187,36 @@ int TileManager::gen_tiles(bool sliced) int image_h = max(1, params.height/resolution); int2 center = make_int2(image_w/2, image_h/2); - state.tiles.clear(); - int num_logical_devices = preserve_tile_device? num_devices: 1; int num = min(image_h, num_logical_devices); int slice_num = sliced? num: 1; - int tile_index = 0; + int tile_w = (tile_size.x >= image_w) ? 
1 : divide_up(image_w, tile_size.x); state.tiles.clear(); - state.tiles.resize(num); - vector<list<Tile> >::iterator tile_list = state.tiles.begin(); + state.render_tiles.clear(); + state.denoising_tiles.clear(); + state.render_tiles.resize(num); + state.denoising_tiles.resize(num); + state.tile_stride = tile_w; + vector<list<int> >::iterator tile_list; + tile_list = state.render_tiles.begin(); if(tile_order == TILE_HILBERT_SPIRAL) { assert(!sliced); + int tile_h = (tile_size.y >= image_h) ? 1 : divide_up(image_h, tile_size.y); + state.tiles.resize(tile_w*tile_h); + /* Size of blocks in tiles, must be a power of 2 */ const int hilbert_size = (max(tile_size.x, tile_size.y) <= 12)? 8: 4; - int tile_w = (tile_size.x >= image_w)? 1: (image_w + tile_size.x - 1)/tile_size.x; - int tile_h = (tile_size.y >= image_h)? 1: (image_h + tile_size.y - 1)/tile_size.y; - int tiles_per_device = (tile_w * tile_h + num - 1) / num; + int tiles_per_device = divide_up(tile_w * tile_h, num); int cur_device = 0, cur_tiles = 0; int2 block_size = tile_size * make_int2(hilbert_size, hilbert_size); /* Number of blocks to fill the image */ - int blocks_x = (block_size.x >= image_w)? 1: (image_w + block_size.x - 1)/block_size.x; - int blocks_y = (block_size.y >= image_h)? 1: (image_h + block_size.y - 1)/block_size.y; + int blocks_x = (block_size.x >= image_w)? 1: divide_up(image_w, block_size.x); + int blocks_y = (block_size.y >= image_h)? 
1: divide_up(image_h, block_size.y); int n = max(blocks_x, blocks_y) | 0x1; /* Side length of the spiral (must be odd) */ /* Offset of spiral (to keep it centered) */ int2 offset = make_int2((image_w - n*block_size.x)/2, (image_h - n*block_size.y)/2); @@ -225,9 +247,11 @@ int TileManager::gen_tiles(bool sliced) if(pos.x >= 0 && pos.y >= 0 && pos.x < image_w && pos.y < image_h) { int w = min(tile_size.x, image_w - pos.x); int h = min(tile_size.y, image_h - pos.y); - tile_list->push_front(Tile(tile_index, pos.x, pos.y, w, h, cur_device)); + int2 ipos = pos / tile_size; + int idx = ipos.y*tile_w + ipos.x; + state.tiles[idx] = Tile(idx, pos.x, pos.y, w, h, cur_device, Tile::RENDER); + tile_list->push_front(idx); cur_tiles++; - tile_index++; if(cur_tiles == tiles_per_device) { tile_list++; @@ -271,27 +295,28 @@ int TileManager::gen_tiles(bool sliced) break; } } - return tile_index; + return tile_w*tile_h; } + int idx = 0; for(int slice = 0; slice < slice_num; slice++) { int slice_y = (image_h/slice_num)*slice; int slice_h = (slice == slice_num-1)? image_h - slice*(image_h/slice_num): image_h/slice_num; - int tile_w = (tile_size.x >= image_w)? 1: (image_w + tile_size.x - 1)/tile_size.x; - int tile_h = (tile_size.y >= slice_h)? 1: (slice_h + tile_size.y - 1)/tile_size.y; + int tile_h = (tile_size.y >= slice_h)? 1: divide_up(slice_h, tile_size.y); - int tiles_per_device = (tile_w * tile_h + num - 1) / num; + int tiles_per_device = divide_up(tile_w * tile_h, num); int cur_device = 0, cur_tiles = 0; for(int tile_y = 0; tile_y < tile_h; tile_y++) { - for(int tile_x = 0; tile_x < tile_w; tile_x++, tile_index++) { + for(int tile_x = 0; tile_x < tile_w; tile_x++, idx++) { int x = tile_x * tile_size.x; int y = tile_y * tile_size.y; int w = (tile_x == tile_w-1)? image_w - x: tile_size.x; int h = (tile_y == tile_h-1)? slice_h - y: tile_size.y; - tile_list->push_back(Tile(tile_index, x, y + slice_y, w, h, sliced? 
slice: cur_device)); + state.tiles.push_back(Tile(idx, x, y + slice_y, w, h, sliced? slice: cur_device, Tile::RENDER)); + tile_list->push_back(idx); if(!sliced) { cur_tiles++; @@ -299,7 +324,7 @@ int TileManager::gen_tiles(bool sliced) if(cur_tiles == tiles_per_device) { /* Tiles are already generated in Bottom-to-Top order, so no sort is necessary in that case. */ if(tile_order != TILE_BOTTOM_TO_TOP) { - tile_list->sort(TileComparator(tile_order, center)); + tile_list->sort(TileComparator(tile_order, center, &state.tiles[0])); } tile_list++; cur_tiles = 0; @@ -313,7 +338,7 @@ int TileManager::gen_tiles(bool sliced) } } - return tile_index; + return idx; } void TileManager::set_tiles() @@ -333,15 +358,111 @@ void TileManager::set_tiles() state.buffer.full_height = max(1, params.full_height/resolution); } -bool TileManager::next_tile(Tile& tile, int device) +int TileManager::get_neighbor_index(int index, int neighbor) +{ + static const int dx[] = {-1, 0, 1, -1, 1, -1, 0, 1, 0}, dy[] = {-1, -1, -1, 0, 0, 1, 1, 1, 0}; + + int resolution = state.resolution_divider; + int image_w = max(1, params.width/resolution); + int image_h = max(1, params.height/resolution); + int tile_w = (tile_size.x >= image_w)? 1: divide_up(image_w, tile_size.x); + int tile_h = (tile_size.y >= image_h)? 1: divide_up(image_h, tile_size.y); + + int nx = state.tiles[index].x/tile_size.x + dx[neighbor], ny = state.tiles[index].y/tile_size.y + dy[neighbor]; + if(nx < 0 || ny < 0 || nx >= tile_w || ny >= tile_h) + return -1; + + return ny*state.tile_stride + nx; +} + +/* Checks whether all neighbors of a tile (as well as the tile itself) are at least at state min_state. */ +bool TileManager::check_neighbor_state(int index, Tile::State min_state) +{ + if(index < 0 || state.tiles[index].state < min_state) { + return false; + } + for(int neighbor = 0; neighbor < 9; neighbor++) { + int nindex = get_neighbor_index(index, neighbor); + /* Out-of-bounds tiles don't matter. 
*/ + if(nindex >= 0 && state.tiles[nindex].state < min_state) { + return false; + } + } + + return true; +} + +/* Returns whether the tile should be written (and freed if no denoising is used) instead of updating. */ +bool TileManager::finish_tile(int index, bool &delete_tile) +{ + delete_tile = false; + + switch(state.tiles[index].state) { + case Tile::RENDER: + { + if(!schedule_denoising) { + state.tiles[index].state = Tile::DONE; + delete_tile = true; + return true; + } + state.tiles[index].state = Tile::RENDERED; + /* For each neighbor and the tile itself, check whether all of its neighbors have been rendered. If yes, it can be denoised. */ + for(int neighbor = 0; neighbor < 9; neighbor++) { + int nindex = get_neighbor_index(index, neighbor); + if(check_neighbor_state(nindex, Tile::RENDERED)) { + state.tiles[nindex].state = Tile::DENOISE; + state.denoising_tiles[state.tiles[nindex].device].push_back(nindex); + } + } + return false; + } + case Tile::DENOISE: + { + state.tiles[index].state = Tile::DENOISED; + /* For each neighbor and the tile itself, check whether all of its neighbors have been denoised. If yes, it can be freed. */ + for(int neighbor = 0; neighbor < 9; neighbor++) { + int nindex = get_neighbor_index(index, neighbor); + if(check_neighbor_state(nindex, Tile::DENOISED)) { + state.tiles[nindex].state = Tile::DONE; + /* It can happen that the tile just finished denoising and already can be freed here. + * However, in that case it still has to be written before deleting, so we can't delete it yet. */ + if(neighbor == 8) { + delete_tile = true; + } + else { + delete state.tiles[nindex].buffers; + state.tiles[nindex].buffers = NULL; + } + } + } + return true; + } + default: + assert(false); + return true; + } +} + +bool TileManager::next_tile(Tile* &tile, int device) { int logical_device = preserve_tile_device? 
device: 0; - if((logical_device >= state.tiles.size()) || state.tiles[logical_device].empty()) + if(logical_device >= state.render_tiles.size()) + return false; + + if(!state.denoising_tiles[logical_device].empty()) { + int idx = state.denoising_tiles[logical_device].front(); + state.denoising_tiles[logical_device].pop_front(); + tile = &state.tiles[idx]; + return true; + } + + if(state.render_tiles[logical_device].empty()) return false; - tile = Tile(state.tiles[logical_device].front()); - state.tiles[logical_device].pop_front(); + int idx = state.render_tiles[logical_device].front(); + state.render_tiles[logical_device].pop_front(); + tile = &state.tiles[idx]; return true; } diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h index 622b89f7670..e39a8f0627a 100644 --- a/intern/cycles/render/tile.h +++ b/intern/cycles/render/tile.h @@ -31,12 +31,20 @@ public: int index; int x, y, w, h; int device; + /* RENDER: The tile has to be rendered. + * RENDERED: The tile has been rendered, but can't be denoised yet (waiting for neighbors). + * DENOISE: The tile can be denoised now. + * DENOISED: The tile has been denoised, but can't be freed yet (waiting for neighbors). + * DONE: The tile is finished and has been freed. 
*/ + typedef enum { RENDER = 0, RENDERED, DENOISE, DENOISED, DONE } State; + State state; + RenderBuffers *buffers; Tile() {} - Tile(int index_, int x_, int y_, int w_, int h_, int device_) - : index(index_), x(x_), y(y_), w(w_), h(h_), device(device_) {} + Tile(int index_, int x_, int y_, int w_, int h_, int device_, State state_ = RENDER) + : index(index_), x(x_), y(y_), w(w_), h(h_), device(device_), state(state_), buffers(NULL) {} }; /* Tile order */ @@ -58,6 +66,8 @@ public: BufferParams params; struct State { + vector<Tile> tiles; + int tile_stride; BufferParams buffer; int sample; int num_samples; @@ -67,9 +77,12 @@ public: /* Total samples over all pixels: Generally num_samples*num_pixels, * but can be higher due to the initial resolution division for previews. */ uint64_t total_pixel_samples; - /* This vector contains a list of tiles for every logical device in the session. - * In each list, the tiles are sorted according to the tile order setting. */ - vector<list<Tile> > tiles; + + /* These lists contain the indices of the tiles to be rendered/denoised and are used + * when acquiring a new tile for the device. + * Each list in each vector is for one logical device. */ + vector<list<int> > render_tiles; + vector<list<int> > denoising_tiles; } state; int num_samples; @@ -78,10 +91,12 @@ public: bool preserve_tile_device, bool background, TileOrder tile_order, int num_devices = 1); ~TileManager(); + void free_device(); void reset(BufferParams& params, int num_samples); void set_samples(int num_samples); bool next(); - bool next_tile(Tile& tile, int device = 0); + bool next_tile(Tile* &tile, int device = 0); + bool finish_tile(int index, bool& delete_tile); bool done(); void set_tile_order(TileOrder tile_order_) { tile_order = tile_order_; } @@ -96,6 +111,9 @@ public: /* Get number of actual samples to render. */ int get_num_effective_samples(); + + /* Schedule tiles for denoising after they've been rendered. 
*/ + bool schedule_denoising; protected: void set_tiles(); @@ -127,6 +145,9 @@ protected: /* Generate tile list, return number of tiles. */ int gen_tiles(bool sliced); + + int get_neighbor_index(int index, int neighbor); + bool check_neighbor_state(int index, Tile::State state); }; CCL_NAMESPACE_END diff --git a/intern/cycles/test/util_string_test.cpp b/intern/cycles/test/util_string_test.cpp index 22ec8e0ee8e..6c059ba5d12 100644 --- a/intern/cycles/test/util_string_test.cpp +++ b/intern/cycles/test/util_string_test.cpp @@ -245,4 +245,41 @@ TEST(util_string_remove_trademark, both) EXPECT_EQ(str, "foo bar zzz"); } +TEST(util_string_remove_trademark, both_space) +{ + string str = string_remove_trademark("foo bar(TM) (R) zzz"); + EXPECT_EQ(str, "foo bar zzz"); +} + +TEST(util_string_remove_trademark, both_space_around) +{ + string str = string_remove_trademark("foo bar (TM) (R) zzz"); + EXPECT_EQ(str, "foo bar zzz"); +} + +TEST(util_string_remove_trademark, trademark_space_suffix) +{ + string str = string_remove_trademark("foo bar (TM)"); + EXPECT_EQ(str, "foo bar"); +} + +TEST(util_string_remove_trademark, trademark_space_middle) +{ + string str = string_remove_trademark("foo bar (TM) baz"); + EXPECT_EQ(str, "foo bar baz"); +} + + +TEST(util_string_remove_trademark, r_space_suffix) +{ + string str = string_remove_trademark("foo bar (R)"); + EXPECT_EQ(str, "foo bar"); +} + +TEST(util_string_remove_trademark, r_space_middle) +{ + string str = string_remove_trademark("foo bar (R) baz"); + EXPECT_EQ(str, "foo bar baz"); +} + CCL_NAMESPACE_END diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index a015fef8284..43f9a57d099 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -53,6 +53,13 @@ set(SRC_HEADERS util_math_cdf.h util_math_fast.h util_math_intersect.h + util_math_float2.h + util_math_float3.h + util_math_float4.h + util_math_int2.h + util_math_int3.h + util_math_int4.h + util_math_matrix.h 
util_md5.h util_opengl.h util_optimization.h @@ -80,6 +87,32 @@ set(SRC_HEADERS util_time.h util_transform.h util_types.h + util_types_float2.h + util_types_float2_impl.h + util_types_float3.h + util_types_float3_impl.h + util_types_float4.h + util_types_float4_impl.h + util_types_int2.h + util_types_int2_impl.h + util_types_int3.h + util_types_int3_impl.h + util_types_int4.h + util_types_int4_impl.h + util_types_uchar2.h + util_types_uchar2_impl.h + util_types_uchar3.h + util_types_uchar3_impl.h + util_types_uchar4.h + util_types_uchar4_impl.h + util_types_uint2.h + util_types_uint2_impl.h + util_types_uint3.h + util_types_uint3_impl.h + util_types_uint4.h + util_types_uint4_impl.h + util_types_vector3.h + util_types_vector3_impl.h util_vector.h util_version.h util_view.h diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h index 6c52117ef9a..643af87a65f 100644 --- a/intern/cycles/util/util_atomic.h +++ b/intern/cycles/util/util_atomic.h @@ -35,6 +35,7 @@ ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value) #define atomic_add_and_fetch_float(p, x) atomic_add_and_fetch_fl((p), (x)) #define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1) +#define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_add_uint32((p), -1) #define CCL_LOCAL_MEM_FENCE 0 #define ccl_barrier(flags) (void)0 @@ -68,6 +69,7 @@ ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *so #define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x)) #define atomic_fetch_and_inc_uint32(p) atomic_inc((p)) +#define atomic_fetch_and_dec_uint32(p) atomic_dec((p)) #define CCL_LOCAL_MEM_FENCE CLK_LOCAL_MEM_FENCE #define ccl_barrier(flags) barrier(flags) @@ -79,7 +81,9 @@ ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *so #define atomic_add_and_fetch_float(p, x) (atomicAdd((float*)(p), (float)(x)) + (float)(x)) #define atomic_fetch_and_add_uint32(p, x) atomicAdd((unsigned int*)(p), 
(unsigned int)(x)) +#define atomic_fetch_and_sub_uint32(p, x) atomicSub((unsigned int*)(p), (unsigned int)(x)) #define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1) +#define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_sub_uint32((p), 1) #define CCL_LOCAL_MEM_FENCE #define ccl_barrier(flags) __syncthreads() diff --git a/intern/cycles/util/util_color.h b/intern/cycles/util/util_color.h index 4d673dc34d8..c73beab98dc 100644 --- a/intern/cycles/util/util_color.h +++ b/intern/cycles/util/util_color.h @@ -157,16 +157,6 @@ ccl_device float3 xyz_to_rgb(float x, float y, float z) 0.055648f * x + -0.204043f * y + 1.057311f * z); } -#ifndef __KERNEL_OPENCL__ - -ccl_device float3 color_srgb_to_scene_linear(float3 c) -{ - return make_float3( - color_srgb_to_scene_linear(c.x), - color_srgb_to_scene_linear(c.y), - color_srgb_to_scene_linear(c.z)); -} - #ifdef __KERNEL_SSE2__ /* * Calculate initial guess for arg^exp based on float representation @@ -222,17 +212,38 @@ ccl_device ssef color_srgb_to_scene_linear(const ssef &c) ssef gte = fastpow24(gtebase); return select(cmp, lt, gte); } -#endif +#endif /* __KERNEL_SSE2__ */ -ccl_device float3 color_scene_linear_to_srgb(float3 c) +ccl_device float3 color_srgb_to_scene_linear_v3(float3 c) { - return make_float3( - color_scene_linear_to_srgb(c.x), - color_scene_linear_to_srgb(c.y), - color_scene_linear_to_srgb(c.z)); + return make_float3(color_srgb_to_scene_linear(c.x), + color_srgb_to_scene_linear(c.y), + color_srgb_to_scene_linear(c.z)); } +ccl_device float3 color_scene_linear_to_srgb_v3(float3 c) +{ + return make_float3(color_scene_linear_to_srgb(c.x), + color_scene_linear_to_srgb(c.y), + color_scene_linear_to_srgb(c.z)); +} + +ccl_device float4 color_srgb_to_scene_linear_v4(float4 c) +{ +#ifdef __KERNEL_SSE2__ + ssef r_ssef; + float4 &r = (float4 &)r_ssef; + r = c; + r_ssef = color_srgb_to_scene_linear(r_ssef); + r.w = c.w; + return r; +#else + return make_float4(color_srgb_to_scene_linear(c.x), + 
color_srgb_to_scene_linear(c.y), + color_srgb_to_scene_linear(c.z), + c.w); #endif +} ccl_device float linear_rgb_to_gray(float3 c) { diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp index 9cfa57dd741..10895f2e918 100644 --- a/intern/cycles/util/util_debug.cpp +++ b/intern/cycles/util/util_debug.cpp @@ -118,7 +118,7 @@ void DebugFlags::OpenCL::reset() } /* Initialize other flags from environment variables. */ debug = (getenv("CYCLES_OPENCL_DEBUG") != NULL); - single_program = (getenv("CYCLES_OPENCL_SINGLE_PROGRAM") != NULL); + single_program = (getenv("CYCLES_OPENCL_MULTI_PROGRAM") == NULL); } DebugFlags::DebugFlags() @@ -184,8 +184,8 @@ std::ostream& operator <<(std::ostream &os, << " Device type : " << opencl_device_type << "\n" << " Kernel type : " << opencl_kernel_type << "\n" << " Debug : " << string_from_bool(debug_flags.opencl.debug) << "\n" - << " Signle program : " << string_from_bool(debug_flags.opencl.single_program) - << "\n"; + << " Single program : " << string_from_bool(debug_flags.opencl.single_program) << "\n" + << " Memory limit : " << string_human_readable_size(debug_flags.opencl.mem_limit) << "\n"; return os; } diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h index 4505d584490..450cd900a9f 100644 --- a/intern/cycles/util/util_debug.h +++ b/intern/cycles/util/util_debug.h @@ -115,6 +115,10 @@ public: /* Use single program */ bool single_program; + + /* TODO(mai): Currently this is only for OpenCL, but we should have it implemented for all devices. */ + /* Artificial memory limit in bytes (0 if disabled). */ + size_t mem_limit; }; /* Get instance of debug flags registry. 
*/ diff --git a/intern/cycles/util/util_guarded_allocator.h b/intern/cycles/util/util_guarded_allocator.h index 5f9dcfb2481..1abcabd5294 100644 --- a/intern/cycles/util/util_guarded_allocator.h +++ b/intern/cycles/util/util_guarded_allocator.h @@ -50,9 +50,9 @@ public: T *allocate(size_t n, const void *hint = 0) { + (void)hint; size_t size = n * sizeof(T); util_guarded_mem_alloc(size); - (void)hint; if(n == 0) { return NULL; } diff --git a/intern/cycles/util/util_logging.cpp b/intern/cycles/util/util_logging.cpp index a5a3bd34fff..f38683bf7de 100644 --- a/intern/cycles/util/util_logging.cpp +++ b/intern/cycles/util/util_logging.cpp @@ -30,10 +30,10 @@ void util_logging_init(const char *argv0) #ifdef WITH_CYCLES_LOGGING using CYCLES_GFLAGS_NAMESPACE::SetCommandLineOption; - /* Make it so FATAL messages are always print into console. */ + /* Make it so ERROR messages are always print into console. */ char severity_fatal[32]; snprintf(severity_fatal, sizeof(severity_fatal), "%d", - google::GLOG_FATAL); + google::GLOG_ERROR); google::InitGoogleLogging(argv0); SetCommandLineOption("logtostderr", "1"); diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h index ecf9c9cfee0..492f830e67c 100644 --- a/intern/cycles/util/util_logging.h +++ b/intern/cycles/util/util_logging.h @@ -19,28 +19,30 @@ #if defined(WITH_CYCLES_LOGGING) && !defined(__KERNEL_GPU__) # include <glog/logging.h> -#else -# include <iostream> #endif +#include <iostream> + CCL_NAMESPACE_BEGIN #if !defined(WITH_CYCLES_LOGGING) || defined(__KERNEL_GPU__) -class StubStream : public std::ostream { - public: - StubStream() : std::ostream(NULL) { } +class StubStream { +public: + template<class T> + StubStream& operator<<(const T&) { + return *this; + } }; class LogMessageVoidify { public: LogMessageVoidify() { } - void operator&(::std::ostream&) { } + void operator&(StubStream&) { } }; # define LOG_SUPPRESS() (true) ? 
(void) 0 : LogMessageVoidify() & StubStream() # define LOG(severity) LOG_SUPPRESS() # define VLOG(severity) LOG_SUPPRESS() - #endif #define VLOG_ONCE(level, flag) if(!flag) flag = true, VLOG(level) diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index e0305b978b9..b719640b19c 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -28,12 +28,10 @@ #ifndef __KERNEL_OPENCL__ - -#include <float.h> -#include <math.h> -#include <stdio.h> - -#endif +# include <float.h> +# include <math.h> +# include <stdio.h> +#endif /* __KERNEL_OPENCL__ */ #include "util/util_types.h" @@ -43,49 +41,44 @@ CCL_NAMESPACE_BEGIN /* Division */ #ifndef M_PI_F -#define M_PI_F (3.1415926535897932f) /* pi */ +# define M_PI_F (3.1415926535897932f) /* pi */ #endif #ifndef M_PI_2_F -#define M_PI_2_F (1.5707963267948966f) /* pi/2 */ +# define M_PI_2_F (1.5707963267948966f) /* pi/2 */ #endif #ifndef M_PI_4_F -#define M_PI_4_F (0.7853981633974830f) /* pi/4 */ +# define M_PI_4_F (0.7853981633974830f) /* pi/4 */ #endif #ifndef M_1_PI_F -#define M_1_PI_F (0.3183098861837067f) /* 1/pi */ +# define M_1_PI_F (0.3183098861837067f) /* 1/pi */ #endif #ifndef M_2_PI_F -#define M_2_PI_F (0.6366197723675813f) /* 2/pi */ +# define M_2_PI_F (0.6366197723675813f) /* 2/pi */ #endif /* Multiplication */ #ifndef M_2PI_F -#define M_2PI_F (6.2831853071795864f) /* 2*pi */ +# define M_2PI_F (6.2831853071795864f) /* 2*pi */ #endif #ifndef M_4PI_F -#define M_4PI_F (12.566370614359172f) /* 4*pi */ +# define M_4PI_F (12.566370614359172f) /* 4*pi */ #endif /* Float sqrt variations */ - #ifndef M_SQRT2_F -#define M_SQRT2_F (1.4142135623730950f) /* sqrt(2) */ +# define M_SQRT2_F (1.4142135623730950f) /* sqrt(2) */ #endif - #ifndef M_LN2_F -#define M_LN2_F (0.6931471805599453f) /* ln(2) */ +# define M_LN2_F (0.6931471805599453f) /* ln(2) */ #endif - #ifndef M_LN10_F -#define M_LN10_F (2.3025850929940457f) /* ln(10) */ +# define M_LN10_F (2.3025850929940457f) /* ln(10) */ 
#endif /* Scalar */ #ifdef _WIN32 - -#ifndef __KERNEL_OPENCL__ - +# ifndef __KERNEL_OPENCL__ ccl_device_inline float fmaxf(float a, float b) { return (a > b)? a: b; @@ -95,13 +88,10 @@ ccl_device_inline float fminf(float a, float b) { return (a < b)? a: b; } - -#endif - -#endif +# endif /* !__KERNEL_OPENCL__ */ +#endif /* _WIN32 */ #ifndef __KERNEL_GPU__ - using std::isfinite; using std::isnan; @@ -157,8 +147,7 @@ ccl_device_inline T max4(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); } - -#endif +#endif /* __KERNEL_GPU__ */ ccl_device_inline float min4(float a, float b, float c, float d) { @@ -170,525 +159,141 @@ ccl_device_inline float max4(float a, float b, float c, float d) return max(max(a, b), max(c, d)); } -ccl_device_inline float max3(float3 a) -{ - return max(max(a.x, a.y), a.z); -} - #ifndef __KERNEL_OPENCL__ +/* Int/Float conversion */ -ccl_device_inline int clamp(int a, int mn, int mx) -{ - return min(max(a, mn), mx); -} - -ccl_device_inline float clamp(float a, float mn, float mx) -{ - return min(max(a, mn), mx); -} - -ccl_device_inline float mix(float a, float b, float t) -{ - return a + t*(b - a); -} - -#endif - -#ifndef __KERNEL_CUDA__ - -ccl_device_inline float saturate(float a) -{ - return clamp(a, 0.0f, 1.0f); -} - -#endif - -ccl_device_inline int float_to_int(float f) -{ - return (int)f; -} - -ccl_device_inline int floor_to_int(float f) -{ - return float_to_int(floorf(f)); -} - -ccl_device_inline int ceil_to_int(float f) -{ - return float_to_int(ceilf(f)); -} - -ccl_device_inline float signf(float f) -{ - return (f < 0.0f)? 
-1.0f: 1.0f; -} - -ccl_device_inline float nonzerof(float f, float eps) -{ - if(fabsf(f) < eps) - return signf(f)*eps; - else - return f; -} - -ccl_device_inline float smoothstepf(float f) -{ - float ff = f*f; - return (3.0f*ff - 2.0f*ff*f); -} - -ccl_device_inline int mod(int x, int m) -{ - return (x % m + m) % m; -} - -/* Float2 Vector */ - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline bool is_zero(const float2& a) -{ - return (a.x == 0.0f && a.y == 0.0f); -} - -#endif - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline float average(const float2& a) -{ - return (a.x + a.y)*(1.0f/2.0f); -} - -#endif - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline float2 operator-(const float2& a) -{ - return make_float2(-a.x, -a.y); -} - -ccl_device_inline float2 operator*(const float2& a, const float2& b) -{ - return make_float2(a.x*b.x, a.y*b.y); -} - -ccl_device_inline float2 operator*(const float2& a, float f) -{ - return make_float2(a.x*f, a.y*f); -} - -ccl_device_inline float2 operator*(float f, const float2& a) -{ - return make_float2(a.x*f, a.y*f); -} - -ccl_device_inline float2 operator/(float f, const float2& a) -{ - return make_float2(f/a.x, f/a.y); -} - -ccl_device_inline float2 operator/(const float2& a, float f) -{ - float invf = 1.0f/f; - return make_float2(a.x*invf, a.y*invf); -} - -ccl_device_inline float2 operator/(const float2& a, const float2& b) +ccl_device_inline int as_int(uint i) { - return make_float2(a.x/b.x, a.y/b.y); + union { uint ui; int i; } u; + u.ui = i; + return u.i; } -ccl_device_inline float2 operator+(const float2& a, const float2& b) +ccl_device_inline uint as_uint(int i) { - return make_float2(a.x+b.x, a.y+b.y); + union { uint ui; int i; } u; + u.i = i; + return u.ui; } -ccl_device_inline float2 operator-(const float2& a, const float2& b) +ccl_device_inline uint as_uint(float f) { - return make_float2(a.x-b.x, a.y-b.y); + union { uint i; float f; } u; + u.f = f; + return u.i; } -ccl_device_inline float2 operator+=(float2& a, const float2& 
b) +ccl_device_inline int __float_as_int(float f) { - return a = a + b; + union { int i; float f; } u; + u.f = f; + return u.i; } -ccl_device_inline float2 operator*=(float2& a, const float2& b) +ccl_device_inline float __int_as_float(int i) { - return a = a * b; + union { int i; float f; } u; + u.i = i; + return u.f; } -ccl_device_inline float2 operator*=(float2& a, float f) +ccl_device_inline uint __float_as_uint(float f) { - return a = a * f; + union { uint i; float f; } u; + u.f = f; + return u.i; } -ccl_device_inline float2 operator/=(float2& a, const float2& b) +ccl_device_inline float __uint_as_float(uint i) { - return a = a / b; + union { uint i; float f; } u; + u.i = i; + return u.f; } +#endif /* __KERNEL_OPENCL__ */ -ccl_device_inline float2 operator/=(float2& a, float f) +/* Versions of functions which are safe for fast math. */ +ccl_device_inline bool isnan_safe(float f) { - float invf = 1.0f/f; - return a = a * invf; + unsigned int x = __float_as_uint(f); + return (x << 1) > 0xff000000u; } - -ccl_device_inline float dot(const float2& a, const float2& b) +ccl_device_inline bool isfinite_safe(float f) { - return a.x*b.x + a.y*b.y; + /* By IEEE 754 rule, 2*Inf equals Inf */ + unsigned int x = __float_as_uint(f); + return (f == f) && (x == 0 || (f != 2.0f*f)) && !((x << 1) > 0xff000000u); } -ccl_device_inline float cross(const float2& a, const float2& b) +ccl_device_inline float ensure_finite(float v) { - return (a.x*b.y - a.y*b.x); + return isfinite_safe(v)? 
v : 0.0f; } -#endif - #ifndef __KERNEL_OPENCL__ - -ccl_device_inline bool operator==(const int2 a, const int2 b) -{ - return (a.x == b.x && a.y == b.y); -} - -ccl_device_inline float len(const float2& a) -{ - return sqrtf(dot(a, a)); -} - -ccl_device_inline float2 normalize(const float2& a) -{ - return a/len(a); -} - -ccl_device_inline float2 normalize_len(const float2& a, float *t) -{ - *t = len(a); - return a/(*t); -} - -ccl_device_inline float2 safe_normalize(const float2& a) -{ - float t = len(a); - return (t != 0.0f)? a/t: a; -} - -ccl_device_inline bool operator==(const float2& a, const float2& b) -{ - return (a.x == b.x && a.y == b.y); -} - -ccl_device_inline bool operator!=(const float2& a, const float2& b) -{ - return !(a == b); -} - -ccl_device_inline float2 min(const float2& a, const float2& b) -{ - return make_float2(min(a.x, b.x), min(a.y, b.y)); -} - -ccl_device_inline float2 max(const float2& a, const float2& b) -{ - return make_float2(max(a.x, b.x), max(a.y, b.y)); -} - -ccl_device_inline float2 clamp(const float2& a, const float2& mn, const float2& mx) +ccl_device_inline int clamp(int a, int mn, int mx) { return min(max(a, mn), mx); } -ccl_device_inline float2 fabs(const float2& a) -{ - return make_float2(fabsf(a.x), fabsf(a.y)); -} - -ccl_device_inline float2 as_float2(const float4& a) -{ - return make_float2(a.x, a.y); -} - -#endif - -#ifndef __KERNEL_GPU__ - -ccl_device_inline void print_float2(const char *label, const float2& a) -{ - printf("%s: %.8f %.8f\n", label, (double)a.x, (double)a.y); -} - -#endif - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline float2 interp(const float2& a, const float2& b, float t) -{ - return a + t*(b - a); -} - -#endif - -/* Float3 Vector */ - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline float3 operator-(const float3& a) -{ -#ifdef __KERNEL_SSE__ - return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)))); -#else - return make_float3(-a.x, -a.y, -a.z); -#endif -} - -ccl_device_inline 
float3 operator*(const float3& a, const float3& b) -{ -#ifdef __KERNEL_SSE__ - return float3(_mm_mul_ps(a.m128,b.m128)); -#else - return make_float3(a.x*b.x, a.y*b.y, a.z*b.z); -#endif -} - -ccl_device_inline float3 operator*(const float3& a, const float f) -{ -#ifdef __KERNEL_SSE__ - return float3(_mm_mul_ps(a.m128,_mm_set1_ps(f))); -#else - return make_float3(a.x*f, a.y*f, a.z*f); -#endif -} - -ccl_device_inline float3 operator*(const float f, const float3& a) -{ - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */ -#if defined(__KERNEL_SSE__) && 0 - return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128)); -#else - return make_float3(a.x*f, a.y*f, a.z*f); -#endif -} - -ccl_device_inline float3 operator/(const float f, const float3& a) -{ - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */ -#if defined(__KERNEL_SSE__) && 0 - __m128 rc = _mm_rcp_ps(a.m128); - return float3(_mm_mul_ps(_mm_set1_ps(f),rc)); -#else - return make_float3(f / a.x, f / a.y, f / a.z); -#endif -} - -ccl_device_inline float3 operator/(const float3& a, const float f) -{ - float invf = 1.0f/f; - return a * invf; -} - -ccl_device_inline float3 operator/(const float3& a, const float3& b) -{ - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. 
*/ -#if defined(__KERNEL_SSE__) && 0 - __m128 rc = _mm_rcp_ps(b.m128); - return float3(_mm_mul_ps(a, rc)); -#else - return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); -#endif -} - -ccl_device_inline float3 operator+(const float3& a, const float3& b) -{ -#ifdef __KERNEL_SSE__ - return float3(_mm_add_ps(a.m128, b.m128)); -#else - return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); -#endif -} - -ccl_device_inline float3 operator-(const float3& a, const float3& b) -{ -#ifdef __KERNEL_SSE__ - return float3(_mm_sub_ps(a.m128, b.m128)); -#else - return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); -#endif -} - -ccl_device_inline float3 operator+=(float3& a, const float3& b) -{ - return a = a + b; -} - -ccl_device_inline float3 operator*=(float3& a, const float3& b) -{ - return a = a * b; -} - -ccl_device_inline float3 operator*=(float3& a, float f) -{ - return a = a * f; -} - -ccl_device_inline float3 operator/=(float3& a, const float3& b) -{ - return a = a / b; -} - -ccl_device_inline float3 operator/=(float3& a, float f) -{ - float invf = 1.0f/f; - return a = a * invf; -} - -ccl_device_inline float dot(const float3& a, const float3& b) -{ -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) - return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F)); -#else - return a.x*b.x + a.y*b.y + a.z*b.z; -#endif -} - -ccl_device_inline float dot_xy(const float3& a, const float3& b) -{ -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) - return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a,b),b)); -#else - return a.x*b.x + a.y*b.y; -#endif -} - -ccl_device_inline float dot(const float4& a, const float4& b) -{ -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) - return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF)); -#else - return (a.x*b.x + a.y*b.y) + (a.z*b.z + a.w*b.w); -#endif -} - -ccl_device_inline float3 cross(const float3& a, const float3& b) -{ - float3 r = make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); - return r; -} - -#endif - -ccl_device_inline float 
len(const float3 a) -{ -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) - return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F))); -#else - return sqrtf(dot(a, a)); -#endif -} - -ccl_device_inline float len_squared(const float3 a) -{ - return dot(a, a); -} - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline float len_squared(const float4& a) -{ - return dot(a, a); -} - -ccl_device_inline float3 normalize(const float3& a) -{ -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) - __m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F)); - return _mm_div_ps(a.m128, norm); -#else - return a/len(a); -#endif -} - -#endif - -ccl_device_inline float3 saturate3(float3 a) +ccl_device_inline float clamp(float a, float mn, float mx) { - return make_float3(saturate(a.x), saturate(a.y), saturate(a.z)); + return min(max(a, mn), mx); } -ccl_device_inline float3 normalize_len(const float3 a, float *t) +ccl_device_inline float mix(float a, float b, float t) { - *t = len(a); - float x = 1.0f / *t; - return a*x; + return a + t*(b - a); } +#endif /* __KERNEL_OPENCL__ */ -ccl_device_inline float3 safe_normalize(const float3 a) +#ifndef __KERNEL_CUDA__ +ccl_device_inline float saturate(float a) { - float t = len(a); - return (t != 0.0f)? a * (1.0f/t) : a; + return clamp(a, 0.0f, 1.0f); } +#endif /* __KERNEL_CUDA__ */ -ccl_device_inline float3 safe_normalize_len(const float3 a, float *t) +ccl_device_inline int float_to_int(float f) { - *t = len(a); - return (*t != 0.0f)? 
a/(*t): a; + return (int)f; } -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline bool operator==(const float3& a, const float3& b) +ccl_device_inline int floor_to_int(float f) { -#ifdef __KERNEL_SSE__ - return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7; -#else - return (a.x == b.x && a.y == b.y && a.z == b.z); -#endif + return float_to_int(floorf(f)); } -ccl_device_inline bool operator!=(const float3& a, const float3& b) +ccl_device_inline int ceil_to_int(float f) { - return !(a == b); + return float_to_int(ceilf(f)); } -ccl_device_inline float3 min(const float3& a, const float3& b) +ccl_device_inline float signf(float f) { -#ifdef __KERNEL_SSE__ - return _mm_min_ps(a.m128, b.m128); -#else - return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); -#endif + return (f < 0.0f)? -1.0f: 1.0f; } -ccl_device_inline float3 max(const float3& a, const float3& b) +ccl_device_inline float nonzerof(float f, float eps) { -#ifdef __KERNEL_SSE__ - return _mm_max_ps(a.m128, b.m128); -#else - return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); -#endif + if(fabsf(f) < eps) + return signf(f)*eps; + else + return f; } -ccl_device_inline float3 clamp(const float3& a, const float3& mn, const float3& mx) +ccl_device_inline float smoothstepf(float f) { - return min(max(a, mn), mx); + float ff = f*f; + return (3.0f*ff - 2.0f*ff*f); } -ccl_device_inline float3 fabs(const float3& a) +ccl_device_inline int mod(int x, int m) { -#ifdef __KERNEL_SSE__ - __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); - return _mm_and_ps(a.m128, mask); -#else - return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z)); -#endif + return (x % m + m) % m; } -#endif - ccl_device_inline float3 float2_to_float3(const float2 a) { return make_float3(a.x, a.y, 0.0f); @@ -704,546 +309,19 @@ ccl_device_inline float4 float3_to_float4(const float3 a) return make_float4(a.x, a.y, a.z, 1.0f); } -#ifndef __KERNEL_GPU__ - -ccl_device_inline void print_float3(const char *label, const 
float3& a) -{ - printf("%s: %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z); -} - -ccl_device_inline float3 rcp(const float3& a) -{ -#ifdef __KERNEL_SSE__ - float4 r = _mm_rcp_ps(a.m128); - return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); -#else - return make_float3(1.0f/a.x, 1.0f/a.y, 1.0f/a.z); -#endif -} - -#endif - -ccl_device_inline float3 interp(float3 a, float3 b, float t) -{ - return a + t*(b - a); -} - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline float3 mix(const float3& a, const float3& b, float t) -{ - return a + t*(b - a); -} - -#endif - -ccl_device_inline bool is_zero(const float3 a) -{ -#ifdef __KERNEL_SSE__ - return a == make_float3(0.0f); -#else - return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f); -#endif -} - -ccl_device_inline float reduce_add(const float3 a) -{ - return (a.x + a.y + a.z); -} - -ccl_device_inline float average(const float3 a) -{ - return reduce_add(a)*(1.0f/3.0f); -} - -ccl_device_inline bool isequal_float3(const float3 a, const float3 b) -{ -#ifdef __KERNEL_OPENCL__ - return all(a == b); -#else - return a == b; -#endif -} - -/* Float4 Vector */ - -#ifdef __KERNEL_SSE__ - -template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> __forceinline const float4 shuffle(const float4& b) -{ - return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0))); -} - -#if defined(__KERNEL_SSE3__) -template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b) -{ - return _mm_moveldup_ps(b); -} - -template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b) -{ - return _mm_movehdup_ps(b); -} -#endif - -template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b) -{ - return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b))); -} - -#endif - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline float4 operator-(const float4& a) -{ -#ifdef __KERNEL_SSE__ - __m128 mask = 
_mm_castsi128_ps(_mm_set1_epi32(0x80000000)); - return _mm_xor_ps(a.m128, mask); -#else - return make_float4(-a.x, -a.y, -a.z, -a.w); -#endif -} - -ccl_device_inline float4 operator*(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_mul_ps(a.m128, b.m128); -#else - return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); -#endif -} - -ccl_device_inline float4 operator*(const float4& a, float f) -{ -#if defined(__KERNEL_SSE__) - return a * make_float4(f); -#else - return make_float4(a.x*f, a.y*f, a.z*f, a.w*f); -#endif -} - -ccl_device_inline float4 operator*(float f, const float4& a) -{ - return a * f; -} - -ccl_device_inline float4 rcp(const float4& a) -{ -#ifdef __KERNEL_SSE__ - float4 r = _mm_rcp_ps(a.m128); - return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); -#else - return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w); -#endif -} - -ccl_device_inline float4 operator/(const float4& a, float f) -{ - return a * (1.0f/f); -} - -ccl_device_inline float4 operator/(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return a * rcp(b); -#else - return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); -#endif - -} - -ccl_device_inline float4 operator+(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_add_ps(a.m128, b.m128); -#else - return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); -#endif -} - -ccl_device_inline float4 operator-(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_sub_ps(a.m128, b.m128); -#else - return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); -#endif -} - -ccl_device_inline float4 operator+=(float4& a, const float4& b) -{ - return a = a + b; -} - -ccl_device_inline float4 operator*=(float4& a, const float4& b) -{ - return a = a * b; -} - -ccl_device_inline float4 operator/=(float4& a, float f) -{ - return a = a / f; -} - -ccl_device_inline int4 operator<(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return 
_mm_cvtps_epi32(_mm_cmplt_ps(a.m128, b.m128)); /* todo: avoid cvt */ -#else - return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w); -#endif -} - -ccl_device_inline int4 operator>=(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_cvtps_epi32(_mm_cmpge_ps(a.m128, b.m128)); /* todo: avoid cvt */ -#else - return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w); -#endif -} - -ccl_device_inline int4 operator<=(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_cvtps_epi32(_mm_cmple_ps(a.m128, b.m128)); /* todo: avoid cvt */ -#else - return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w); -#endif -} - -ccl_device_inline bool operator==(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15; -#else - return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w); -#endif -} - -ccl_device_inline float4 cross(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return (shuffle<1,2,0,0>(a)*shuffle<2,0,1,0>(b)) - (shuffle<2,0,1,0>(a)*shuffle<1,2,0,0>(b)); -#else - return make_float4(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x, 0.0f); -#endif -} - -ccl_device_inline bool is_zero(const float4& a) -{ -#ifdef __KERNEL_SSE__ - return a == make_float4(0.0f); -#else - return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f); -#endif -} - -ccl_device_inline float reduce_add(const float4& a) -{ -#ifdef __KERNEL_SSE__ - float4 h = shuffle<1,0,3,2>(a) + a; - return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h); /* todo: efficiency? 
*/ -#else - return ((a.x + a.y) + (a.z + a.w)); -#endif -} - -ccl_device_inline float average(const float4& a) -{ - return reduce_add(a) * 0.25f; -} - -ccl_device_inline float len(const float4& a) -{ - return sqrtf(dot(a, a)); -} - -ccl_device_inline float4 normalize(const float4& a) -{ - return a/len(a); -} - -ccl_device_inline float4 safe_normalize(const float4& a) -{ - float t = len(a); - return (t != 0.0f)? a/t: a; -} - -ccl_device_inline float4 min(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_min_ps(a.m128, b.m128); -#else - return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); -#endif -} - -ccl_device_inline float4 max(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_max_ps(a.m128, b.m128); -#else - return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); -#endif -} - -#endif - -#ifndef __KERNEL_GPU__ - -ccl_device_inline float4 select(const int4& mask, const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_or_ps(_mm_and_ps(_mm_cvtepi32_ps(mask), a), _mm_andnot_ps(_mm_cvtepi32_ps(mask), b)); /* todo: avoid cvt */ -#else - return make_float4((mask.x)? a.x: b.x, (mask.y)? a.y: b.y, (mask.z)? a.z: b.z, (mask.w)? 
a.w: b.w); -#endif -} - -ccl_device_inline float4 reduce_min(const float4& a) -{ -#ifdef __KERNEL_SSE__ - float4 h = min(shuffle<1,0,3,2>(a), a); - return min(shuffle<2,3,0,1>(h), h); -#else - return make_float4(min(min(a.x, a.y), min(a.z, a.w))); -#endif -} - -ccl_device_inline float4 reduce_max(const float4& a) -{ -#ifdef __KERNEL_SSE__ - float4 h = max(shuffle<1,0,3,2>(a), a); - return max(shuffle<2,3,0,1>(h), h); -#else - return make_float4(max(max(a.x, a.y), max(a.z, a.w))); -#endif -} - -#if 0 -ccl_device_inline float4 reduce_add(const float4& a) -{ -#ifdef __KERNEL_SSE__ - float4 h = shuffle<1,0,3,2>(a) + a; - return shuffle<2,3,0,1>(h) + h; -#else - return make_float4((a.x + a.y) + (a.z + a.w)); -#endif -} -#endif - -ccl_device_inline void print_float4(const char *label, const float4& a) -{ - printf("%s: %.8f %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z, (double)a.w); -} - -#endif - -/* Int2 */ - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline int2 operator+(const int2 &a, const int2 &b) -{ - return make_int2(a.x + b.x, a.y + b.y); -} - -ccl_device_inline int2 operator+=(int2 &a, const int2 &b) -{ - return a = a + b; -} - -ccl_device_inline int2 operator-(const int2 &a, const int2 &b) -{ - return make_int2(a.x - b.x, a.y - b.y); -} - -ccl_device_inline int2 operator*(const int2 &a, const int2 &b) -{ - return make_int2(a.x * b.x, a.y * b.y); -} - -ccl_device_inline int2 operator/(const int2 &a, const int2 &b) -{ - return make_int2(a.x / b.x, a.y / b.y); -} - -#endif - -/* Int3 */ - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline int3 min(int3 a, int3 b) -{ -#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) - return _mm_min_epi32(a.m128, b.m128); -#else - return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); -#endif -} - -ccl_device_inline int3 max(int3 a, int3 b) -{ -#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) - return _mm_max_epi32(a.m128, b.m128); -#else - return make_int3(max(a.x, b.x), max(a.y, b.y), 
max(a.z, b.z)); -#endif -} - -ccl_device_inline int3 clamp(const int3& a, int mn, int mx) -{ -#ifdef __KERNEL_SSE__ - return min(max(a, make_int3(mn)), make_int3(mx)); -#else - return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx)); -#endif -} - -ccl_device_inline int3 clamp(const int3& a, int3& mn, int mx) -{ -#ifdef __KERNEL_SSE__ - return min(max(a, mn), make_int3(mx)); -#else - return make_int3(clamp(a.x, mn.x, mx), clamp(a.y, mn.y, mx), clamp(a.z, mn.z, mx)); -#endif -} - -#endif - -#ifndef __KERNEL_GPU__ - -ccl_device_inline void print_int3(const char *label, const int3& a) -{ - printf("%s: %d %d %d\n", label, a.x, a.y, a.z); -} - -#endif - -/* Int4 */ - -#ifndef __KERNEL_GPU__ - -ccl_device_inline int4 operator+(const int4& a, const int4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_add_epi32(a.m128, b.m128); -#else - return make_int4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); -#endif -} - -ccl_device_inline int4 operator+=(int4& a, const int4& b) -{ - return a = a + b; -} - -ccl_device_inline int4 operator>>(const int4& a, int i) -{ -#ifdef __KERNEL_SSE__ - return _mm_srai_epi32(a.m128, i); -#else - return make_int4(a.x >> i, a.y >> i, a.z >> i, a.w >> i); -#endif -} - -ccl_device_inline int4 min(int4 a, int4 b) -{ -#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) - return _mm_min_epi32(a.m128, b.m128); -#else - return make_int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); -#endif -} - -ccl_device_inline int4 max(int4 a, int4 b) -{ -#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) - return _mm_max_epi32(a.m128, b.m128); -#else - return make_int4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); -#endif -} - -ccl_device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx) -{ - return min(max(a, mn), mx); -} - -ccl_device_inline int4 select(const int4& mask, const int4& a, const int4& b) -{ -#ifdef __KERNEL_SSE__ - __m128 m = _mm_cvtepi32_ps(mask); - return 
_mm_castps_si128(_mm_or_ps(_mm_and_ps(m, _mm_castsi128_ps(a)), _mm_andnot_ps(m, _mm_castsi128_ps(b)))); /* todo: avoid cvt */ -#else - return make_int4((mask.x)? a.x: b.x, (mask.y)? a.y: b.y, (mask.z)? a.z: b.z, (mask.w)? a.w: b.w); -#endif -} +CCL_NAMESPACE_END -ccl_device_inline void print_int4(const char *label, const int4& a) -{ - printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w); -} +#include "util/util_math_int2.h" +#include "util/util_math_int3.h" +#include "util/util_math_int4.h" -#endif +#include "util/util_math_float2.h" +#include "util/util_math_float3.h" +#include "util/util_math_float4.h" -/* Int/Float conversion */ +CCL_NAMESPACE_BEGIN #ifndef __KERNEL_OPENCL__ - -ccl_device_inline int as_int(uint i) -{ - union { uint ui; int i; } u; - u.ui = i; - return u.i; -} - -ccl_device_inline uint as_uint(int i) -{ - union { uint ui; int i; } u; - u.i = i; - return u.ui; -} - -ccl_device_inline uint as_uint(float f) -{ - union { uint i; float f; } u; - u.f = f; - return u.i; -} - -ccl_device_inline int __float_as_int(float f) -{ - union { int i; float f; } u; - u.f = f; - return u.i; -} - -ccl_device_inline float __int_as_float(int i) -{ - union { int i; float f; } u; - u.i = i; - return u.f; -} - -ccl_device_inline uint __float_as_uint(float f) -{ - union { uint i; float f; } u; - u.f = f; - return u.i; -} - -ccl_device_inline float __uint_as_float(uint i) -{ - union { uint i; float f; } u; - u.i = i; - return u.f; -} - - /* Interpolation */ template<class A, class B> A lerp(const A& a, const A& b, const B& t) @@ -1253,26 +331,13 @@ template<class A, class B> A lerp(const A& a, const A& b, const B& t) /* Triangle */ -ccl_device_inline float triangle_area(const float3& v1, const float3& v2, const float3& v3) +ccl_device_inline float triangle_area(const float3& v1, + const float3& v2, + const float3& v3) { return len(cross(v3 - v2, v1 - v2))*0.5f; } - -#endif - -/* Versions of functions which are safe for fast math. 
*/ -ccl_device_inline bool isnan_safe(float f) -{ - unsigned int x = __float_as_uint(f); - return (x << 1) > 0xff000000u; -} - -ccl_device_inline bool isfinite_safe(float f) -{ - /* By IEEE 754 rule, 2*Inf equals Inf */ - unsigned int x = __float_as_uint(f); - return (f == f) && (x == 0 || (f != 2.0f*f)) && !((x << 1) > 0xff000000u); -} +#endif /* __KERNEL_OPENCL__ */ /* Orthonormal vectors */ @@ -1369,16 +434,16 @@ ccl_device_inline float3 rotate_around_axis(float3 p, float3 axis, float angle) float3 r; r.x = ((costheta + (1 - costheta) * axis.x * axis.x) * p.x) + - (((1 - costheta) * axis.x * axis.y - axis.z * sintheta) * p.y) + - (((1 - costheta) * axis.x * axis.z + axis.y * sintheta) * p.z); + (((1 - costheta) * axis.x * axis.y - axis.z * sintheta) * p.y) + + (((1 - costheta) * axis.x * axis.z + axis.y * sintheta) * p.z); r.y = (((1 - costheta) * axis.x * axis.y + axis.z * sintheta) * p.x) + - ((costheta + (1 - costheta) * axis.y * axis.y) * p.y) + - (((1 - costheta) * axis.y * axis.z - axis.x * sintheta) * p.z); + ((costheta + (1 - costheta) * axis.y * axis.y) * p.y) + + (((1 - costheta) * axis.y * axis.z - axis.x * sintheta) * p.z); r.z = (((1 - costheta) * axis.x * axis.z - axis.y * sintheta) * p.x) + - (((1 - costheta) * axis.y * axis.z + axis.x * sintheta) * p.y) + - ((costheta + (1 - costheta) * axis.z * axis.z) * p.z); + (((1 - costheta) * axis.y * axis.z + axis.x * sintheta) * p.y) + + ((costheta + (1 - costheta) * axis.z * axis.z) * p.z); return r; } @@ -1427,17 +492,17 @@ ccl_device float safe_powf(float a, float b) return compatible_powf(a, b); } -ccl_device float safe_logf(float a, float b) +ccl_device float safe_divide(float a, float b) { - if(UNLIKELY(a < 0.0f || b < 0.0f)) - return 0.0f; - - return logf(a)/logf(b); + return (b != 0.0f)? a/b: 0.0f; } -ccl_device float safe_divide(float a, float b) +ccl_device float safe_logf(float a, float b) { - return (b != 0.0f)? 
a/b: 0.0f; + if(UNLIKELY(a <= 0.0f || b <= 0.0f)) + return 0.0f; + + return safe_divide(logf(a),logf(b)); } ccl_device float safe_modulo(float a, float b) @@ -1493,31 +558,6 @@ ccl_device_inline float2 map_to_sphere(const float3 co) return make_float2(u, v); } -ccl_device_inline int util_max_axis(float3 vec) -{ -#ifdef __KERNEL_SSE__ - __m128 a = shuffle<0,0,1,1>(vec.m128); - __m128 b = shuffle<1,2,2,1>(vec.m128); - __m128 c = _mm_cmpgt_ps(a, b); - int mask = _mm_movemask_ps(c) & 0x7; - static const char tab[8] = {2, 2, 2, 0, 1, 2, 1, 0}; - return tab[mask]; -#else - if(vec.x > vec.y) { - if(vec.x > vec.z) - return 0; - else - return 2; - } - else { - if(vec.y > vec.z) - return 1; - else - return 2; - } -#endif -} - CCL_NAMESPACE_END #endif /* __UTIL_MATH_H__ */ diff --git a/intern/cycles/util/util_math_float2.h b/intern/cycles/util/util_math_float2.h new file mode 100644 index 00000000000..6f9d0855d50 --- /dev/null +++ b/intern/cycles/util/util_math_float2.h @@ -0,0 +1,227 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_MATH_FLOAT2_H__ +#define __UTIL_MATH_FLOAT2_H__ + +#ifndef __UTIL_MATH_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +/******************************************************************************* + * Declaration. 
+ */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline float2 operator-(const float2& a); +ccl_device_inline float2 operator*(const float2& a, const float2& b); +ccl_device_inline float2 operator*(const float2& a, float f); +ccl_device_inline float2 operator*(float f, const float2& a); +ccl_device_inline float2 operator/(float f, const float2& a); +ccl_device_inline float2 operator/(const float2& a, float f); +ccl_device_inline float2 operator/(const float2& a, const float2& b); +ccl_device_inline float2 operator+(const float2& a, const float2& b); +ccl_device_inline float2 operator-(const float2& a, const float2& b); +ccl_device_inline float2 operator+=(float2& a, const float2& b); +ccl_device_inline float2 operator*=(float2& a, const float2& b); +ccl_device_inline float2 operator*=(float2& a, float f); +ccl_device_inline float2 operator/=(float2& a, const float2& b); +ccl_device_inline float2 operator/=(float2& a, float f); + +ccl_device_inline bool operator==(const float2& a, const float2& b); +ccl_device_inline bool operator!=(const float2& a, const float2& b); + +ccl_device_inline bool is_zero(const float2& a); +ccl_device_inline float average(const float2& a); +ccl_device_inline float dot(const float2& a, const float2& b); +ccl_device_inline float cross(const float2& a, const float2& b); +ccl_device_inline float len(const float2& a); +ccl_device_inline float2 normalize(const float2& a); +ccl_device_inline float2 normalize_len(const float2& a, float *t); +ccl_device_inline float2 safe_normalize(const float2& a); +ccl_device_inline float2 min(const float2& a, const float2& b); +ccl_device_inline float2 max(const float2& a, const float2& b); +ccl_device_inline float2 clamp(const float2& a, const float2& mn, const float2& mx); +ccl_device_inline float2 fabs(const float2& a); +ccl_device_inline float2 as_float2(const float4& a); +ccl_device_inline float2 interp(const float2& a, const float2& b, float t); +#endif /* !__KERNEL_OPENCL__ */ + 
+/******************************************************************************* + * Definition. + */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline float2 operator-(const float2& a) +{ + return make_float2(-a.x, -a.y); +} + +ccl_device_inline float2 operator*(const float2& a, const float2& b) +{ + return make_float2(a.x*b.x, a.y*b.y); +} + +ccl_device_inline float2 operator*(const float2& a, float f) +{ + return make_float2(a.x*f, a.y*f); +} + +ccl_device_inline float2 operator*(float f, const float2& a) +{ + return make_float2(a.x*f, a.y*f); +} + +ccl_device_inline float2 operator/(float f, const float2& a) +{ + return make_float2(f/a.x, f/a.y); +} + +ccl_device_inline float2 operator/(const float2& a, float f) +{ + float invf = 1.0f/f; + return make_float2(a.x*invf, a.y*invf); +} + +ccl_device_inline float2 operator/(const float2& a, const float2& b) +{ + return make_float2(a.x/b.x, a.y/b.y); +} + +ccl_device_inline float2 operator+(const float2& a, const float2& b) +{ + return make_float2(a.x+b.x, a.y+b.y); +} + +ccl_device_inline float2 operator-(const float2& a, const float2& b) +{ + return make_float2(a.x-b.x, a.y-b.y); +} + +ccl_device_inline float2 operator+=(float2& a, const float2& b) +{ + return a = a + b; +} + +ccl_device_inline float2 operator*=(float2& a, const float2& b) +{ + return a = a * b; +} + +ccl_device_inline float2 operator*=(float2& a, float f) +{ + return a = a * f; +} + +ccl_device_inline float2 operator/=(float2& a, const float2& b) +{ + return a = a / b; +} + +ccl_device_inline float2 operator/=(float2& a, float f) +{ + float invf = 1.0f/f; + return a = a * invf; +} + +ccl_device_inline bool operator==(const float2& a, const float2& b) +{ + return (a.x == b.x && a.y == b.y); +} + +ccl_device_inline bool operator!=(const float2& a, const float2& b) +{ + return !(a == b); +} + +ccl_device_inline bool is_zero(const float2& a) +{ + return (a.x == 0.0f && a.y == 0.0f); +} + +ccl_device_inline float average(const float2& a) +{ + return 
(a.x + a.y)*(1.0f/2.0f); +} + +ccl_device_inline float dot(const float2& a, const float2& b) +{ + return a.x*b.x + a.y*b.y; +} + +ccl_device_inline float cross(const float2& a, const float2& b) +{ + return (a.x*b.y - a.y*b.x); +} + +ccl_device_inline float len(const float2& a) +{ + return sqrtf(dot(a, a)); +} + +ccl_device_inline float2 normalize(const float2& a) +{ + return a/len(a); +} + +ccl_device_inline float2 normalize_len(const float2& a, float *t) +{ + *t = len(a); + return a/(*t); +} + +ccl_device_inline float2 safe_normalize(const float2& a) +{ + float t = len(a); + return (t != 0.0f)? a/t: a; +} + +ccl_device_inline float2 min(const float2& a, const float2& b) +{ + return make_float2(min(a.x, b.x), min(a.y, b.y)); +} + +ccl_device_inline float2 max(const float2& a, const float2& b) +{ + return make_float2(max(a.x, b.x), max(a.y, b.y)); +} + +ccl_device_inline float2 clamp(const float2& a, const float2& mn, const float2& mx) +{ + return min(max(a, mn), mx); +} + +ccl_device_inline float2 fabs(const float2& a) +{ + return make_float2(fabsf(a.x), fabsf(a.y)); +} + +ccl_device_inline float2 as_float2(const float4& a) +{ + return make_float2(a.x, a.y); +} + +ccl_device_inline float2 interp(const float2& a, const float2& b, float t) +{ + return a + t*(b - a); +} +#endif /* !__KERNEL_OPENCL__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_FLOAT2_H__ */ diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h new file mode 100644 index 00000000000..bb04c4aa2d9 --- /dev/null +++ b/intern/cycles/util/util_math_float3.h @@ -0,0 +1,385 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_MATH_FLOAT3_H__ +#define __UTIL_MATH_FLOAT3_H__ + +#ifndef __UTIL_MATH_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +/******************************************************************************* + * Declaration. + */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline float3 operator-(const float3& a); +ccl_device_inline float3 operator*(const float3& a, const float3& b); +ccl_device_inline float3 operator*(const float3& a, const float f); +ccl_device_inline float3 operator*(const float f, const float3& a); +ccl_device_inline float3 operator/(const float f, const float3& a); +ccl_device_inline float3 operator/(const float3& a, const float f); +ccl_device_inline float3 operator/(const float3& a, const float3& b); +ccl_device_inline float3 operator+(const float3& a, const float3& b); +ccl_device_inline float3 operator-(const float3& a, const float3& b); +ccl_device_inline float3 operator+=(float3& a, const float3& b); +ccl_device_inline float3 operator-=(float3& a, const float3& b); +ccl_device_inline float3 operator*=(float3& a, const float3& b); +ccl_device_inline float3 operator*=(float3& a, float f); +ccl_device_inline float3 operator/=(float3& a, const float3& b); +ccl_device_inline float3 operator/=(float3& a, float f); + +ccl_device_inline bool operator==(const float3& a, const float3& b); +ccl_device_inline bool operator!=(const float3& a, const float3& b); + +ccl_device_inline float dot(const float3& a, const float3& b); +ccl_device_inline float 
dot_xy(const float3& a, const float3& b); +ccl_device_inline float3 cross(const float3& a, const float3& b); +ccl_device_inline float3 normalize(const float3& a); +ccl_device_inline float3 min(const float3& a, const float3& b); +ccl_device_inline float3 max(const float3& a, const float3& b); +ccl_device_inline float3 clamp(const float3& a, const float3& mn, const float3& mx); +ccl_device_inline float3 fabs(const float3& a); +ccl_device_inline float3 mix(const float3& a, const float3& b, float t); +ccl_device_inline float3 rcp(const float3& a); +#endif /* !__KERNEL_OPENCL__ */ + +ccl_device_inline float max3(float3 a); +ccl_device_inline float len(const float3 a); +ccl_device_inline float len_squared(const float3 a); + +ccl_device_inline float3 saturate3(float3 a); +ccl_device_inline float3 safe_normalize(const float3 a); +ccl_device_inline float3 normalize_len(const float3 a, float *t);; +ccl_device_inline float3 safe_normalize_len(const float3 a, float *t); +ccl_device_inline float3 interp(float3 a, float3 b, float t); + +ccl_device_inline bool is_zero(const float3 a); +ccl_device_inline float reduce_add(const float3 a); +ccl_device_inline float average(const float3 a); +ccl_device_inline bool isequal_float3(const float3 a, const float3 b); + +/******************************************************************************* + * Definition. 
+ */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline float3 operator-(const float3& a) +{ +#ifdef __KERNEL_SSE__ + return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)))); +#else + return make_float3(-a.x, -a.y, -a.z); +#endif +} + +ccl_device_inline float3 operator*(const float3& a, const float3& b) +{ +#ifdef __KERNEL_SSE__ + return float3(_mm_mul_ps(a.m128,b.m128)); +#else + return make_float3(a.x*b.x, a.y*b.y, a.z*b.z); +#endif +} + +ccl_device_inline float3 operator*(const float3& a, const float f) +{ +#ifdef __KERNEL_SSE__ + return float3(_mm_mul_ps(a.m128,_mm_set1_ps(f))); +#else + return make_float3(a.x*f, a.y*f, a.z*f); +#endif +} + +ccl_device_inline float3 operator*(const float f, const float3& a) +{ + /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */ +#if defined(__KERNEL_SSE__) && 0 + return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128)); +#else + return make_float3(a.x*f, a.y*f, a.z*f); +#endif +} + +ccl_device_inline float3 operator/(const float f, const float3& a) +{ + /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */ +#if defined(__KERNEL_SSE__) && 0 + __m128 rc = _mm_rcp_ps(a.m128); + return float3(_mm_mul_ps(_mm_set1_ps(f),rc)); +#else + return make_float3(f / a.x, f / a.y, f / a.z); +#endif +} + +ccl_device_inline float3 operator/(const float3& a, const float f) +{ + float invf = 1.0f/f; + return a * invf; +} + +ccl_device_inline float3 operator/(const float3& a, const float3& b) +{ + /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. 
*/ +#if defined(__KERNEL_SSE__) && 0 + __m128 rc = _mm_rcp_ps(b.m128); + return float3(_mm_mul_ps(a, rc)); +#else + return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); +#endif +} + +ccl_device_inline float3 operator+(const float3& a, const float3& b) +{ +#ifdef __KERNEL_SSE__ + return float3(_mm_add_ps(a.m128, b.m128)); +#else + return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); +#endif +} + +ccl_device_inline float3 operator-(const float3& a, const float3& b) +{ +#ifdef __KERNEL_SSE__ + return float3(_mm_sub_ps(a.m128, b.m128)); +#else + return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); +#endif +} + +ccl_device_inline float3 operator+=(float3& a, const float3& b) +{ + return a = a + b; +} + +ccl_device_inline float3 operator-=(float3& a, const float3& b) +{ + return a = a - b; +} + +ccl_device_inline float3 operator*=(float3& a, const float3& b) +{ + return a = a * b; +} + +ccl_device_inline float3 operator*=(float3& a, float f) +{ + return a = a * f; +} + +ccl_device_inline float3 operator/=(float3& a, const float3& b) +{ + return a = a / b; +} + +ccl_device_inline float3 operator/=(float3& a, float f) +{ + float invf = 1.0f/f; + return a = a * invf; +} + +ccl_device_inline bool operator==(const float3& a, const float3& b) +{ +#ifdef __KERNEL_SSE__ + return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7; +#else + return (a.x == b.x && a.y == b.y && a.z == b.z); +#endif +} + +ccl_device_inline bool operator!=(const float3& a, const float3& b) +{ + return !(a == b); +} + +ccl_device_inline float dot(const float3& a, const float3& b) +{ +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) + return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F)); +#else + return a.x*b.x + a.y*b.y + a.z*b.z; +#endif +} + +ccl_device_inline float dot_xy(const float3& a, const float3& b) +{ +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) + return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a,b),b)); +#else + return a.x*b.x + a.y*b.y; +#endif +} + +ccl_device_inline float3 
cross(const float3& a, const float3& b) +{ + float3 r = make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); + return r; +} + +ccl_device_inline float3 normalize(const float3& a) +{ +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) + __m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F)); + return float3(_mm_div_ps(a.m128, norm)); +#else + return a/len(a); +#endif +} + +ccl_device_inline float3 min(const float3& a, const float3& b) +{ +#ifdef __KERNEL_SSE__ + return float3(_mm_min_ps(a.m128, b.m128)); +#else + return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); +#endif +} + +ccl_device_inline float3 max(const float3& a, const float3& b) +{ +#ifdef __KERNEL_SSE__ + return float3(_mm_max_ps(a.m128, b.m128)); +#else + return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); +#endif +} + +ccl_device_inline float3 clamp(const float3& a, const float3& mn, const float3& mx) +{ + return min(max(a, mn), mx); +} + +ccl_device_inline float3 fabs(const float3& a) +{ +#ifdef __KERNEL_SSE__ + __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return float3(_mm_and_ps(a.m128, mask)); +#else + return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z)); +#endif +} + +ccl_device_inline float3 mix(const float3& a, const float3& b, float t) +{ + return a + t*(b - a); +} + +ccl_device_inline float3 rcp(const float3& a) +{ +#ifdef __KERNEL_SSE__ + const float4 r(_mm_rcp_ps(a.m128)); + return float3(_mm_sub_ps(_mm_add_ps(r, r), + _mm_mul_ps(_mm_mul_ps(r, r), a))); +#else + return make_float3(1.0f/a.x, 1.0f/a.y, 1.0f/a.z); +#endif +} +#endif /* !__KERNEL_OPENCL__ */ + +ccl_device_inline float max3(float3 a) +{ + return max(max(a.x, a.y), a.z); +} + +ccl_device_inline float len(const float3 a) +{ +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) + return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F))); +#else + return sqrtf(dot(a, a)); +#endif +} + +ccl_device_inline float len_squared(const float3 a) +{ + return 
dot(a, a); +} + +ccl_device_inline float3 saturate3(float3 a) +{ + return make_float3(saturate(a.x), saturate(a.y), saturate(a.z)); +} + +ccl_device_inline float3 normalize_len(const float3 a, float *t) +{ + *t = len(a); + float x = 1.0f / *t; + return a*x; +} + +ccl_device_inline float3 safe_normalize(const float3 a) +{ + float t = len(a); + return (t != 0.0f)? a * (1.0f/t) : a; +} + +ccl_device_inline float3 safe_normalize_len(const float3 a, float *t) +{ + *t = len(a); + return (*t != 0.0f)? a/(*t): a; +} + +ccl_device_inline float3 interp(float3 a, float3 b, float t) +{ + return a + t*(b - a); +} + +ccl_device_inline bool is_zero(const float3 a) +{ +#ifdef __KERNEL_SSE__ + return a == make_float3(0.0f); +#else + return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f); +#endif +} + +ccl_device_inline float reduce_add(const float3 a) +{ + return (a.x + a.y + a.z); +} + +ccl_device_inline float average(const float3 a) +{ + return reduce_add(a)*(1.0f/3.0f); +} + +ccl_device_inline bool isequal_float3(const float3 a, const float3 b) +{ +#ifdef __KERNEL_OPENCL__ + return all(a == b); +#else + return a == b; +#endif +} + +ccl_device_inline bool isfinite3_safe(float3 v) +{ + return isfinite_safe(v.x) && isfinite_safe(v.y) && isfinite_safe(v.z); +} + +ccl_device_inline float3 ensure_finite3(float3 v) +{ + if(!isfinite_safe(v.x)) v.x = 0.0f; + if(!isfinite_safe(v.y)) v.y = 0.0f; + if(!isfinite_safe(v.z)) v.z = 0.0f; + return v; +} + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_FLOAT3_H__ */ diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h new file mode 100644 index 00000000000..d89121b3a1d --- /dev/null +++ b/intern/cycles/util/util_math_float4.h @@ -0,0 +1,393 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_MATH_FLOAT4_H__ +#define __UTIL_MATH_FLOAT4_H__ + +#ifndef __UTIL_MATH_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +/******************************************************************************* + * Declaration. + */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline float4 operator-(const float4& a); +ccl_device_inline float4 operator*(const float4& a, const float4& b); +ccl_device_inline float4 operator*(const float4& a, float f); +ccl_device_inline float4 operator*(float f, const float4& a); +ccl_device_inline float4 operator/(const float4& a, float f); +ccl_device_inline float4 operator/(const float4& a, const float4& b); +ccl_device_inline float4 operator+(const float4& a, const float4& b); +ccl_device_inline float4 operator-(const float4& a, const float4& b); +ccl_device_inline float4 operator+=(float4& a, const float4& b); +ccl_device_inline float4 operator*=(float4& a, const float4& b); +ccl_device_inline float4 operator/=(float4& a, float f); + +ccl_device_inline int4 operator<(const float4& a, const float4& b); +ccl_device_inline int4 operator>=(const float4& a, const float4& b); +ccl_device_inline int4 operator<=(const float4& a, const float4& b); +ccl_device_inline bool operator==(const float4& a, const float4& b); + +ccl_device_inline float dot(const float4& a, const float4& b); +ccl_device_inline float len_squared(const float4& a); +ccl_device_inline float4 rcp(const float4& a); +ccl_device_inline float4 cross(const float4& a, const float4& b); 
+ccl_device_inline bool is_zero(const float4& a); +ccl_device_inline float reduce_add(const float4& a); +ccl_device_inline float average(const float4& a); +ccl_device_inline float len(const float4& a); +ccl_device_inline float4 normalize(const float4& a); +ccl_device_inline float4 safe_normalize(const float4& a); +ccl_device_inline float4 min(const float4& a, const float4& b); +ccl_device_inline float4 max(const float4& a, const float4& b); +#endif /* !__KERNEL_OPENCL__*/ + +#ifdef __KERNEL_SSE__ +template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> +__forceinline const float4 shuffle(const float4& b); + +template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b); + +# ifdef __KERNEL_SSE3__ +template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b); +template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b); +# endif +#endif /* __KERNEL_SSE__ */ + +#ifndef __KERNEL_GPU__ +ccl_device_inline float4 select(const int4& mask, + const float4& a, + const float4& b); +ccl_device_inline float4 reduce_min(const float4& a); +ccl_device_inline float4 reduce_max(const float4& a); +# if 0 +ccl_device_inline float4 reduce_add(const float4& a); +# endif +#endif /* !__KERNEL_GPU__ */ + +/******************************************************************************* + * Definition. 
+ */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline float4 operator-(const float4& a) +{ +#ifdef __KERNEL_SSE__ + __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + return float4(_mm_xor_ps(a.m128, mask)); +#else + return make_float4(-a.x, -a.y, -a.z, -a.w); +#endif +} + +ccl_device_inline float4 operator*(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_mul_ps(a.m128, b.m128)); +#else + return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); +#endif +} + +ccl_device_inline float4 operator*(const float4& a, float f) +{ +#if defined(__KERNEL_SSE__) + return a * make_float4(f); +#else + return make_float4(a.x*f, a.y*f, a.z*f, a.w*f); +#endif +} + +ccl_device_inline float4 operator*(float f, const float4& a) +{ + return a * f; +} + +ccl_device_inline float4 operator/(const float4& a, float f) +{ + return a * (1.0f/f); +} + +ccl_device_inline float4 operator/(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return a * rcp(b); +#else + return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); +#endif + +} + +ccl_device_inline float4 operator+(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_add_ps(a.m128, b.m128)); +#else + return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); +#endif +} + +ccl_device_inline float4 operator-(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_sub_ps(a.m128, b.m128)); +#else + return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); +#endif +} + +ccl_device_inline float4 operator+=(float4& a, const float4& b) +{ + return a = a + b; +} + +ccl_device_inline float4 operator*=(float4& a, const float4& b) +{ + return a = a * b; +} + +ccl_device_inline float4 operator/=(float4& a, float f) +{ + return a = a / f; +} + +ccl_device_inline int4 operator<(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + /* TODO(sergey): avoid cvt. 
*/ + return int4(_mm_cvtps_epi32(_mm_cmplt_ps(a.m128, b.m128))); +#else + return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w); +#endif +} + +ccl_device_inline int4 operator>=(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + /* TODO(sergey): avoid cvt. */ + return int4(_mm_cvtps_epi32(_mm_cmpge_ps(a.m128, b.m128))); +#else + return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w); +#endif +} + +ccl_device_inline int4 operator<=(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + /* TODO(sergey): avoid cvt. */ + return int4(_mm_cvtps_epi32(_mm_cmple_ps(a.m128, b.m128))); +#else + return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w); +#endif +} + +ccl_device_inline bool operator==(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15; +#else + return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w); +#endif +} + +ccl_device_inline float dot(const float4& a, const float4& b) +{ +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) + return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF)); +#else + return (a.x*b.x + a.y*b.y) + (a.z*b.z + a.w*b.w); +#endif +} + +ccl_device_inline float len_squared(const float4& a) +{ + return dot(a, a); +} + +ccl_device_inline float4 rcp(const float4& a) +{ +#ifdef __KERNEL_SSE__ + float4 r(_mm_rcp_ps(a.m128)); + return float4(_mm_sub_ps(_mm_add_ps(r, r), + _mm_mul_ps(_mm_mul_ps(r, r), a))); +#else + return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w); +#endif +} + +ccl_device_inline float4 cross(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return (shuffle<1,2,0,0>(a)*shuffle<2,0,1,0>(b)) - + (shuffle<2,0,1,0>(a)*shuffle<1,2,0,0>(b)); +#else + return make_float4(a.y*b.z - a.z*b.y, + a.z*b.x - a.x*b.z, + a.x*b.y - a.y*b.x, + 0.0f); +#endif +} + +ccl_device_inline bool is_zero(const float4& a) +{ +#ifdef __KERNEL_SSE__ + return a == make_float4(0.0f); +#else + return (a.x == 0.0f && a.y == 
0.0f && a.z == 0.0f && a.w == 0.0f); +#endif +} + +ccl_device_inline float reduce_add(const float4& a) +{ +#ifdef __KERNEL_SSE__ + float4 h(shuffle<1,0,3,2>(a) + a); + /* TODO(sergey): Investigate efficiency. */ + return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h); +#else + return ((a.x + a.y) + (a.z + a.w)); +#endif +} + +ccl_device_inline float average(const float4& a) +{ + return reduce_add(a) * 0.25f; +} + +ccl_device_inline float len(const float4& a) +{ + return sqrtf(dot(a, a)); +} + +ccl_device_inline float4 normalize(const float4& a) +{ + return a/len(a); +} + +ccl_device_inline float4 safe_normalize(const float4& a) +{ + float t = len(a); + return (t != 0.0f)? a/t: a; +} + +ccl_device_inline float4 min(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_min_ps(a.m128, b.m128)); +#else + return make_float4(min(a.x, b.x), + min(a.y, b.y), + min(a.z, b.z), + min(a.w, b.w)); +#endif +} + +ccl_device_inline float4 max(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_max_ps(a.m128, b.m128)); +#else + return make_float4(max(a.x, b.x), + max(a.y, b.y), + max(a.z, b.z), + max(a.w, b.w)); +#endif +} +#endif /* !__KERNEL_OPENCL__*/ + +#ifdef __KERNEL_SSE__ +template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> +__forceinline const float4 shuffle(const float4& b) +{ + return float4(_mm_castsi128_ps( + _mm_shuffle_epi32(_mm_castps_si128(b), + _MM_SHUFFLE(index_3, index_2, index_1, index_0)))); +} + +template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b) +{ + return float4(_mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b)))); +} + +# ifdef __KERNEL_SSE3__ +template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b) +{ + return float4(_mm_moveldup_ps(b)); +} + +template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b) +{ + return float4(_mm_movehdup_ps(b)); +} +# endif /* __KERNEL_SSE3__ */ +#endif /* __KERNEL_SSE__ */ + +#ifndef __KERNEL_GPU__ 
+ccl_device_inline float4 select(const int4& mask, + const float4& a, + const float4& b) +{ +#ifdef __KERNEL_SSE__ + /* TODO(sergey): avoid cvt. */ + return float4(_mm_or_ps(_mm_and_ps(_mm_cvtepi32_ps(mask), a), + _mm_andnot_ps(_mm_cvtepi32_ps(mask), b))); +#else + return make_float4((mask.x)? a.x: b.x, + (mask.y)? a.y: b.y, + (mask.z)? a.z: b.z, + (mask.w)? a.w: b.w); +#endif +} + +ccl_device_inline float4 reduce_min(const float4& a) +{ +#ifdef __KERNEL_SSE__ + float4 h = min(shuffle<1,0,3,2>(a), a); + return min(shuffle<2,3,0,1>(h), h); +#else + return make_float4(min(min(a.x, a.y), min(a.z, a.w))); +#endif +} + +ccl_device_inline float4 reduce_max(const float4& a) +{ +#ifdef __KERNEL_SSE__ + float4 h = max(shuffle<1,0,3,2>(a), a); + return max(shuffle<2,3,0,1>(h), h); +#else + return make_float4(max(max(a.x, a.y), max(a.z, a.w))); +#endif +} + +#if 0 +ccl_device_inline float4 reduce_add(const float4& a) +{ +#ifdef __KERNEL_SSE__ + float4 h = shuffle<1,0,3,2>(a) + a; + return shuffle<2,3,0,1>(h) + h; +#else + return make_float4((a.x + a.y) + (a.z + a.w)); +#endif +} +#endif +#endif /* !__KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_FLOAT4_H__ */ diff --git a/intern/cycles/util/util_math_int2.h b/intern/cycles/util/util_math_int2.h new file mode 100644 index 00000000000..828c49a131c --- /dev/null +++ b/intern/cycles/util/util_math_int2.h @@ -0,0 +1,77 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_MATH_INT2_H__ +#define __UTIL_MATH_INT2_H__ + +#ifndef __UTIL_MATH_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +/******************************************************************************* + * Declaration. + */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline bool operator==(const int2 a, const int2 b); +ccl_device_inline int2 operator+(const int2 &a, const int2 &b); +ccl_device_inline int2 operator+=(int2 &a, const int2 &b); +ccl_device_inline int2 operator-(const int2 &a, const int2 &b); +ccl_device_inline int2 operator*(const int2 &a, const int2 &b); +ccl_device_inline int2 operator/(const int2 &a, const int2 &b); +#endif /* !__KERNEL_OPENCL__ */ + +/******************************************************************************* + * Definition. + */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline bool operator==(const int2 a, const int2 b) +{ + return (a.x == b.x && a.y == b.y); +} + +ccl_device_inline int2 operator+(const int2 &a, const int2 &b) +{ + return make_int2(a.x + b.x, a.y + b.y); +} + +ccl_device_inline int2 operator+=(int2 &a, const int2 &b) +{ + return a = a + b; +} + +ccl_device_inline int2 operator-(const int2 &a, const int2 &b) +{ + return make_int2(a.x - b.x, a.y - b.y); +} + +ccl_device_inline int2 operator*(const int2 &a, const int2 &b) +{ + return make_int2(a.x * b.x, a.y * b.y); +} + +ccl_device_inline int2 operator/(const int2 &a, const int2 &b) +{ + return make_int2(a.x / b.x, a.y / b.y); +} +#endif /* !__KERNEL_OPENCL__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_INT2_H__ */ diff --git a/intern/cycles/util/util_math_int3.h b/intern/cycles/util/util_math_int3.h new file mode 100644 index 00000000000..fa7a02636de --- /dev/null +++ b/intern/cycles/util/util_math_int3.h @@ -0,0 +1,83 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + 
* Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_MATH_INT3_H__ +#define __UTIL_MATH_INT3_H__ + +#ifndef __UTIL_MATH_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +/******************************************************************************* + * Declaration. + */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline int3 min(int3 a, int3 b); +ccl_device_inline int3 max(int3 a, int3 b); +ccl_device_inline int3 clamp(const int3& a, int mn, int mx); +ccl_device_inline int3 clamp(const int3& a, int3& mn, int mx); +#endif /* !__KERNEL_OPENCL__ */ + +/******************************************************************************* + * Definition. 
+ */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline int3 min(int3 a, int3 b) +{ +#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) + return int3(_mm_min_epi32(a.m128, b.m128)); +#else + return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); +#endif +} + +ccl_device_inline int3 max(int3 a, int3 b) +{ +#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) + return int3(_mm_max_epi32(a.m128, b.m128)); +#else + return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); +#endif +} + +ccl_device_inline int3 clamp(const int3& a, int mn, int mx) +{ +#ifdef __KERNEL_SSE__ + return min(max(a, make_int3(mn)), make_int3(mx)); +#else + return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx)); +#endif +} + +ccl_device_inline int3 clamp(const int3& a, int3& mn, int mx) +{ +#ifdef __KERNEL_SSE__ + return min(max(a, mn), make_int3(mx)); +#else + return make_int3(clamp(a.x, mn.x, mx), + clamp(a.y, mn.y, mx), + clamp(a.z, mn.z, mx)); +#endif +} +#endif /* !__KERNEL_OPENCL__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_INT3_H__ */ diff --git a/intern/cycles/util/util_math_int4.h b/intern/cycles/util/util_math_int4.h new file mode 100644 index 00000000000..79a8c0841e7 --- /dev/null +++ b/intern/cycles/util/util_math_int4.h @@ -0,0 +1,119 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __UTIL_MATH_INT4_H__ +#define __UTIL_MATH_INT4_H__ + +#ifndef __UTIL_MATH_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +/******************************************************************************* + * Declaration. + */ + +#ifndef __KERNEL_GPU__ +ccl_device_inline int4 operator+(const int4& a, const int4& b); +ccl_device_inline int4 operator+=(int4& a, const int4& b); +ccl_device_inline int4 operator>>(const int4& a, int i); +ccl_device_inline int4 min(int4 a, int4 b); +ccl_device_inline int4 max(int4 a, int4 b); +ccl_device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx); +ccl_device_inline int4 select(const int4& mask, const int4& a, const int4& b); +#endif /* __KERNEL_GPU__ */ + +/******************************************************************************* + * Definition. + */ + +#ifndef __KERNEL_GPU__ +ccl_device_inline int4 operator+(const int4& a, const int4& b) +{ +#ifdef __KERNEL_SSE__ + return int4(_mm_add_epi32(a.m128, b.m128)); +#else + return make_int4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); +#endif +} + +ccl_device_inline int4 operator+=(int4& a, const int4& b) +{ + return a = a + b; +} + +ccl_device_inline int4 operator>>(const int4& a, int i) +{ +#ifdef __KERNEL_SSE__ + return int4(_mm_srai_epi32(a.m128, i)); +#else + return make_int4(a.x >> i, a.y >> i, a.z >> i, a.w >> i); +#endif +} + +ccl_device_inline int4 min(int4 a, int4 b) +{ +#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) + return int4(_mm_min_epi32(a.m128, b.m128)); +#else + return make_int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); +#endif +} + +ccl_device_inline int4 max(int4 a, int4 b) +{ +#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) + return int4(_mm_max_epi32(a.m128, b.m128)); +#else + return make_int4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); +#endif +} + +ccl_device_inline int4 clamp(const int4& a, const int4& mn, const 
int4& mx) +{ + return min(max(a, mn), mx); +} + +ccl_device_inline int4 select(const int4& mask, const int4& a, const int4& b) +{ +#ifdef __KERNEL_SSE__ + const __m128 m = _mm_cvtepi32_ps(mask); + /* TODO(sergey): avoid cvt. */ + return int4(_mm_castps_si128( + _mm_or_ps(_mm_and_ps(m, _mm_castsi128_ps(a)), + _mm_andnot_ps(m, _mm_castsi128_ps(b))))); +#else + return make_int4((mask.x)? a.x: b.x, + (mask.y)? a.y: b.y, + (mask.z)? a.z: b.z, + (mask.w)? a.w: b.w); +#endif +} + +ccl_device_inline int4 load_int4(const int *v) +{ +#ifdef __KERNEL_SSE__ + return int4(_mm_loadu_si128((__m128i*)v)); +#else + return make_int4(v[0], v[1], v[2], v[3]); +#endif +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_INT4_H__ */ diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h new file mode 100644 index 00000000000..c7511f8306e --- /dev/null +++ b/intern/cycles/util/util_math_matrix.h @@ -0,0 +1,404 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_MATH_MATRIX_H__ +#define __UTIL_MATH_MATRIX_H__ + +CCL_NAMESPACE_BEGIN + +#define MAT(A, size, row, col) A[(row)*(size)+(col)] + +/* Variants that use a constant stride on GPUS. */ +#ifdef __KERNEL_GPU__ +# define MATS(A, n, r, c, s) A[((r)*(n)+(c))*(s)] +/* Element access when only the lower-triangular elements are stored. 
*/ +# define MATHS(A, r, c, s) A[((r)*((r)+1)/2+(c))*(s)] +# define VECS(V, i, s) V[(i)*(s)] +#else +# define MATS(A, n, r, c, s) MAT(A, n, r, c) +# define MATHS(A, r, c, s) A[(r)*((r)+1)/2+(c)] +# define VECS(V, i, s) V[i] +#endif + +/* Zeroing helpers. */ + +ccl_device_inline void math_vector_zero(float *v, int n) +{ + for(int i = 0; i < n; i++) { + v[i] = 0.0f; + } +} + +ccl_device_inline void math_matrix_zero(float *A, int n) +{ + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { + MAT(A, n, row, col) = 0.0f; + } + } +} + +/* Elementary vector operations. */ + +ccl_device_inline void math_vector_add(float *a, const float *ccl_restrict b, int n) +{ + for(int i = 0; i < n; i++) { + a[i] += b[i]; + } +} + +ccl_device_inline void math_vector_mul(float *a, const float *ccl_restrict b, int n) +{ + for(int i = 0; i < n; i++) { + a[i] *= b[i]; + } +} + +ccl_device_inline void math_vector_mul_strided(ccl_global float *a, const float *ccl_restrict b, int astride, int n) +{ + for(int i = 0; i < n; i++) { + a[i*astride] *= b[i]; + } +} + +ccl_device_inline void math_vector_scale(float *a, float b, int n) +{ + for(int i = 0; i < n; i++) { + a[i] *= b; + } +} + +ccl_device_inline void math_vector_max(float *a, const float *ccl_restrict b, int n) +{ + for(int i = 0; i < n; i++) { + a[i] = max(a[i], b[i]); + } +} + +ccl_device_inline void math_vec3_add(float3 *v, int n, float *x, float3 w) +{ + for(int i = 0; i < n; i++) { + v[i] += w*x[i]; + } +} + +ccl_device_inline void math_vec3_add_strided(ccl_global float3 *v, int n, float *x, float3 w, int stride) +{ + for(int i = 0; i < n; i++) { + v[i*stride] += w*x[i]; + } +} + +/* Elementary matrix operations. + * Note: TriMatrix refers to a square matrix that is symmetric, and therefore its upper-triangular part isn't stored. 
*/ + +ccl_device_inline void math_trimatrix_add_diagonal(ccl_global float *A, int n, float val, int stride) +{ + for(int row = 0; row < n; row++) { + MATHS(A, row, row, stride) += val; + } +} + +/* Add Gramian matrix of v to A. + * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */ +ccl_device_inline void math_matrix_add_gramian(float *A, + int n, + const float *ccl_restrict v, + float weight) +{ + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { + MAT(A, n, row, col) += v[row]*v[col]*weight; + } + } +} + +/* Add Gramian matrix of v to A. + * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */ +ccl_device_inline void math_trimatrix_add_gramian_strided(ccl_global float *A, + int n, + const float *ccl_restrict v, + float weight, + int stride) +{ + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { + MATHS(A, row, col, stride) += v[row]*v[col]*weight; + } + } +} + +/* Transpose matrix A inplace. */ +ccl_device_inline void math_matrix_transpose(ccl_global float *A, int n, int stride) +{ + for(int i = 0; i < n; i++) { + for(int j = 0; j < i; j++) { + float temp = MATS(A, n, i, j, stride); + MATS(A, n, i, j, stride) = MATS(A, n, j, i, stride); + MATS(A, n, j, i, stride) = temp; + } + } +} + +/* Solvers for matrix problems */ + +/* In-place Cholesky-Banachiewicz decomposition of the square, positive-definite matrix A + * into a lower triangular matrix L so that A = L*L^T. A is being overwritten by L. + * Also, only the lower triangular part of A is ever accessed. 
*/ +ccl_device void math_trimatrix_cholesky(ccl_global float *A, int n, int stride) +{ + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { + float sum_col = MATHS(A, row, col, stride); + for(int k = 0; k < col; k++) { + sum_col -= MATHS(A, row, k, stride) * MATHS(A, col, k, stride); + } + if(row == col) { + sum_col = sqrtf(max(sum_col, 0.0f)); + } + else { + sum_col /= MATHS(A, col, col, stride); + } + MATHS(A, row, col, stride) = sum_col; + } + } +} + +/* Solve A*S=y for S given A and y, where A is symmetrical positive-semidefinite and both inputs are destroyed in the process. + * + * We can apply Cholesky decomposition to find a lower triangular L so that L*Lt = A. + * With that we get (L*Lt)*S = L*(Lt*S) = L*b = y, defining b as Lt*S. + * Since L is lower triangular, finding b is relatively easy since y is known. + * Then, the remaining problem is Lt*S = b, which again can be solved easily. + * + * This is useful for solving the normal equation S=inv(Xt*W*X)*Xt*W*y, since Xt*W*X is + * symmetrical positive-semidefinite by construction, so we can just use this function with A=Xt*W*X and y=Xt*W*y. */ +ccl_device_inline void math_trimatrix_vec3_solve(ccl_global float *A, ccl_global float3 *y, int n, int stride) +{ + /* Since the first entry of the design row is always 1, the upper-left element of XtWX is a good + * heuristic for the amount of pixels considered (with weighting), therefore the amount of correction + * is scaled based on it. */ + math_trimatrix_add_diagonal(A, n, 3e-7f*A[0], stride); /* Improve the numerical stability. */ + math_trimatrix_cholesky(A, n, stride); /* Replace A with L so that L*Lt = A. */ + + /* Use forward substitution to solve L*b = y, replacing y by b. 
 */ + for(int row = 0; row < n; row++) { + float3 sum = VECS(y, row, stride); + for(int col = 0; col < row; col++) + sum -= MATHS(A, row, col, stride) * VECS(y, col, stride); + VECS(y, row, stride) = sum / MATHS(A, row, row, stride); + } + + /* Use backward substitution to solve Lt*S = b, replacing b by S. */ + for(int row = n-1; row >= 0; row--) { + float3 sum = VECS(y, row, stride); + for(int col = row+1; col < n; col++) + sum -= MATHS(A, col, row, stride) * VECS(y, col, stride); + VECS(y, row, stride) = sum / MATHS(A, row, row, stride); + } +} + +/* Perform the Jacobi Eigenvalue Method on matrix A. + * A is assumed to be a symmetrical matrix, therefore only the lower-triangular part is ever accessed. + * The algorithm overwrites the contents of A. + * + * After returning, A will be overwritten with D, which is (almost) diagonal, + * and V will contain the eigenvectors of the original A in its rows (!), + * so that A = V^T*D*V. Therefore, the diagonal elements of D are the (sorted) eigenvalues of A. + */ +ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float *V, int n, int v_stride) +{ + const float singular_epsilon = 1e-9f; + + for (int row = 0; row < n; row++) { + for (int col = 0; col < n; col++) { + MATS(V, n, row, col, v_stride) = (col == row) ? 1.0f : 0.0f; + } + } + + for (int sweep = 0; sweep < 8; sweep++) { + float off_diagonal = 0.0f; + for (int row = 1; row < n; row++) { + for (int col = 0; col < row; col++) { + off_diagonal += fabsf(MAT(A, n, row, col)); + } + } + if (off_diagonal < 1e-7f) { + /* The matrix has nearly reached diagonal form. + * Since the eigenvalues are only used to determine truncation, their exact values aren't required - a relative error of a few ULPs won't matter at all. */ + break; + } + + /* Set the threshold for the small element rotation skip in the first sweep: + * Skip all elements that are less than a tenth of the average off-diagonal element. 
 */ + float threshold = 0.2f*off_diagonal / (n*n); + + for(int row = 1; row < n; row++) { + for(int col = 0; col < row; col++) { + /* Perform a Jacobi rotation on this element that reduces it to zero. */ + float element = MAT(A, n, row, col); + float abs_element = fabsf(element); + + /* If we're in a later sweep and the element already is very small, just set it to zero and skip the rotation. */ + if (sweep > 3 && abs_element <= singular_epsilon*fabsf(MAT(A, n, row, row)) && abs_element <= singular_epsilon*fabsf(MAT(A, n, col, col))) { + MAT(A, n, row, col) = 0.0f; + continue; + } + + if(element == 0.0f) { + continue; + } + + /* If we're in one of the first sweeps and the element is smaller than the threshold, skip it. */ + if(sweep < 3 && (abs_element < threshold)) { + continue; + } + + /* Determine rotation: The rotation is characterized by its angle phi - or, in the actual implementation, sin(phi) and cos(phi). + * To find those, we first compute their ratio - that might be unstable if the angle approaches 90°, so there's a fallback for that case. + * Then, we compute sin(phi) and cos(phi) themselves. */ + float singular_diff = MAT(A, n, row, row) - MAT(A, n, col, col); + float ratio; + if (abs_element > singular_epsilon*fabsf(singular_diff)) { + float cot_2phi = 0.5f*singular_diff / element; + ratio = 1.0f / (fabsf(cot_2phi) + sqrtf(1.0f + cot_2phi*cot_2phi)); + if (cot_2phi < 0.0f) ratio = -ratio; /* Copy sign. */ + } + else { + ratio = element / singular_diff; + } + + float c = 1.0f / sqrtf(1.0f + ratio*ratio); + float s = ratio*c; + /* To improve numerical stability by avoiding cancellation, the update equations are reformulated to use sin(phi) and tan(phi/2) instead. */ + float tan_phi_2 = s / (1.0f + c); + + /* Update the singular values in the diagonal. */ + float singular_delta = ratio*element; + MAT(A, n, row, row) += singular_delta; + MAT(A, n, col, col) -= singular_delta; + + /* Set the element itself to zero. 
*/ + MAT(A, n, row, col) = 0.0f; + + /* Perform the actual rotations on the matrices. */ +#define ROT(M, r1, c1, r2, c2, stride) \ + { \ + float M1 = MATS(M, n, r1, c1, stride); \ + float M2 = MATS(M, n, r2, c2, stride); \ + MATS(M, n, r1, c1, stride) -= s*(M2 + tan_phi_2*M1); \ + MATS(M, n, r2, c2, stride) += s*(M1 - tan_phi_2*M2); \ + } + + /* Split into three parts to ensure correct accesses since we only store the lower-triangular part of A. */ + for(int i = 0 ; i < col; i++) ROT(A, col, i, row, i, 1); + for(int i = col+1; i < row; i++) ROT(A, i, col, row, i, 1); + for(int i = row+1; i < n ; i++) ROT(A, i, col, i, row, 1); + + for(int i = 0 ; i < n ; i++) ROT(V, col, i, row, i, v_stride); +#undef ROT + } + } + } + + /* Sort eigenvalues and the associated eigenvectors. */ + for (int i = 0; i < n - 1; i++) { + float v = MAT(A, n, i, i); + int k = i; + for (int j = i; j < n; j++) { + if (MAT(A, n, j, j) >= v) { + v = MAT(A, n, j, j); + k = j; + } + } + if (k != i) { + /* Swap eigenvalues. */ + MAT(A, n, k, k) = MAT(A, n, i, i); + MAT(A, n, i, i) = v; + /* Swap eigenvectors. */ + for (int j = 0; j < n; j++) { + float v = MATS(V, n, i, j, v_stride); + MATS(V, n, i, j, v_stride) = MATS(V, n, k, j, v_stride); + MATS(V, n, k, j, v_stride) = v; + } + } + } +} + +#ifdef __KERNEL_SSE3__ +ccl_device_inline void math_vector_zero_sse(__m128 *A, int n) +{ + for(int i = 0; i < n; i++) { + A[i] = _mm_setzero_ps(); + } +} + +ccl_device_inline void math_matrix_zero_sse(__m128 *A, int n) +{ + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { + MAT(A, n, row, col) = _mm_setzero_ps(); + } + } +} + +/* Add Gramian matrix of v to A. + * The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. 
*/ +ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, const __m128 *ccl_restrict v, __m128 weight) +{ + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { + MAT(A, n, row, col) = _mm_add_ps(MAT(A, n, row, col), _mm_mul_ps(_mm_mul_ps(v[row], v[col]), weight)); + } + } +} + +ccl_device_inline void math_vector_add_sse(__m128 *V, int n, const __m128 *ccl_restrict a) +{ + for(int i = 0; i < n; i++) { + V[i] = _mm_add_ps(V[i], a[i]); + } +} + +ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, const __m128 *ccl_restrict a) +{ + for(int i = 0; i < n; i++) { + V[i] = _mm_mul_ps(V[i], a[i]); + } +} + +ccl_device_inline void math_vector_max_sse(__m128 *a, const __m128 *ccl_restrict b, int n) +{ + for(int i = 0; i < n; i++) { + a[i] = _mm_max_ps(a[i], b[i]); + } +} + +ccl_device_inline void math_matrix_hsum(float *A, int n, const __m128 *ccl_restrict B) +{ + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { + MAT(A, n, row, col) = _mm_hsum_ss(MAT(B, n, row, col)); + } + } +} +#endif + +#undef MAT + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_MATRIX_H__ */ diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp index cd3067f7650..f9c3b4bb139 100644 --- a/intern/cycles/util/util_path.cpp +++ b/intern/cycles/util/util_path.cpp @@ -768,9 +768,17 @@ bool path_remove(const string& path) return remove(path.c_str()) == 0; } -static string line_directive(const string& path, int line) +static string line_directive(const string& base, const string& path, int line) { string escaped_path = path; + /* First we make path relative. */ + if(string_startswith(escaped_path, base.c_str())) { + const string base_file = path_filename(base); + const size_t base_len = base.length(); + escaped_path = base_file + escaped_path.substr(base_len, + escaped_path.length() - base_len); + } + /* Second, we replace all unsafe characters. 
*/ string_replace(escaped_path, "\"", "\\\""); string_replace(escaped_path, "\'", "\\\'"); string_replace(escaped_path, "\?", "\\\?"); @@ -778,13 +786,13 @@ static string line_directive(const string& path, int line) return string_printf("#line %d \"%s\"", line, escaped_path.c_str()); } - -string path_source_replace_includes(const string& source, - const string& path, - const string& source_filename) +static string path_source_replace_includes_recursive( + const string& base, + const string& source, + const string& source_filepath) { /* Our own little c preprocessor that replaces #includes with the file - * contents, to work around issue of opencl drivers not supporting + * contents, to work around issue of OpenCL drivers not supporting * include paths with spaces in them. */ @@ -799,23 +807,22 @@ string path_source_replace_includes(const string& source, if(string_startswith(token, "include")) { token = string_strip(token.substr(7, token.size() - 7)); if(token[0] == '"') { - size_t n_start = 1; - size_t n_end = token.find("\"", n_start); - string filename = token.substr(n_start, n_end - n_start); - string text, filepath = path_join(path, filename); + const size_t n_start = 1; + const size_t n_end = token.find("\"", n_start); + const string filename = token.substr(n_start, n_end - n_start); + string filepath = path_join(base, filename); + if(!path_exists(filepath)) { + filepath = path_join(path_dirname(source_filepath), + filename); + } + string text; if(path_read_text(filepath, text)) { - /* Replace include directories with both current path - * and path extracted from the include file. - * Not totally robust, but works fine for Cycles kernel - * and avoids having list of include directories.x - */ - text = path_source_replace_includes( - text, path_dirname(filepath), filename); - text = path_source_replace_includes(text, path, filename); + text = path_source_replace_includes_recursive( + base, text, filepath); /* Use line directives for better error messages. 
*/ - line = line_directive(filepath, 1) + line = line_directive(base, filepath, 1) + token.replace(0, n_end + 1, "\n" + text + "\n") - + line_directive(path_join(path, source_filename), i + 1); + + line_directive(base, source_filepath, i + 1); } } } @@ -826,6 +833,16 @@ string path_source_replace_includes(const string& source, return result; } +string path_source_replace_includes(const string& source, + const string& path, + const string& source_filename) +{ + return path_source_replace_includes_recursive( + path, + source, + path_join(path, source_filename)); +} + FILE *path_fopen(const string& path, const string& mode) { #ifdef _WIN32 diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h index 39c1eed04e7..134383e88db 100644 --- a/intern/cycles/util/util_progress.h +++ b/intern/cycles/util/util_progress.h @@ -37,9 +37,11 @@ public: pixel_samples = 0; total_pixel_samples = 0; current_tile_sample = 0; - finished_tiles = 0; + rendered_tiles = 0; + denoised_tiles = 0; start_time = time_dt(); render_start_time = time_dt(); + end_time = 0.0; status = "Initializing"; substatus = ""; sync_status = ""; @@ -75,9 +77,11 @@ public: pixel_samples = 0; total_pixel_samples = 0; current_tile_sample = 0; - finished_tiles = 0; + rendered_tiles = 0; + denoised_tiles = 0; start_time = time_dt(); render_start_time = time_dt(); + end_time = 0.0; status = "Initializing"; substatus = ""; sync_status = ""; @@ -144,6 +148,7 @@ public: thread_scoped_lock lock(progress_mutex); start_time = time_dt(); + end_time = 0.0; } void set_render_start_time() @@ -167,8 +172,15 @@ public: { thread_scoped_lock lock(progress_mutex); - total_time_ = time_dt() - start_time; - render_time_ = time_dt() - render_start_time; + double time = (end_time > 0) ? 
end_time : time_dt(); + + total_time_ = time - start_time; + render_time_ = time - render_start_time; + } + + void set_end_time() + { + end_time = time_dt(); } void reset_sample() @@ -177,7 +189,8 @@ public: pixel_samples = 0; current_tile_sample = 0; - finished_tiles = 0; + rendered_tiles = 0; + denoised_tiles = 0; } void set_total_pixel_samples(uint64_t total_pixel_samples_) @@ -209,23 +222,36 @@ public: set_update(); } - void add_finished_tile() + void add_finished_tile(bool denoised) { thread_scoped_lock lock(progress_mutex); - finished_tiles++; + if(denoised) { + denoised_tiles++; + } + else { + rendered_tiles++; + } } int get_current_sample() { + thread_scoped_lock lock(progress_mutex); /* Note that the value here always belongs to the last tile that updated, * so it's only useful if there is only one active tile. */ return current_tile_sample; } - int get_finished_tiles() + int get_rendered_tiles() + { + thread_scoped_lock lock(progress_mutex); + return rendered_tiles; + } + + int get_denoised_tiles() { - return finished_tiles; + thread_scoped_lock lock(progress_mutex); + return denoised_tiles; } /* status messages */ @@ -318,9 +344,11 @@ protected: int current_tile_sample; /* Stores the number of tiles that's already finished. * Used to determine whether all but the last tile are finished rendering, in which case the current_tile_sample is displayed. */ - int finished_tiles; + int rendered_tiles, denoised_tiles; double start_time, render_start_time; + /* End time written when render is done, so it doesn't keep increasing on redraws. 
*/ + double end_time; string status; string substatus; diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h index 557809a5719..587febe3e52 100644 --- a/intern/cycles/util/util_simd.h +++ b/intern/cycles/util/util_simd.h @@ -331,9 +331,9 @@ __forceinline size_t __bscf(size_t& v) static const unsigned int BITSCAN_NO_BIT_SET_32 = 32; static const size_t BITSCAN_NO_BIT_SET_64 = 64; +#ifdef __KERNEL_SSE3__ /* Emulation of SSE4 functions with SSE3 */ - -#if defined(__KERNEL_SSE3) && !defined(__KERNEL_SSE4__) +# ifndef __KERNEL_SSE41__ #define _MM_FROUND_TO_NEAREST_INT 0x00 #define _MM_FROUND_TO_NEG_INF 0x01 @@ -341,42 +341,48 @@ static const size_t BITSCAN_NO_BIT_SET_64 = 64; #define _MM_FROUND_TO_ZERO 0x03 #define _MM_FROUND_CUR_DIRECTION 0x04 +#undef _mm_blendv_ps #define _mm_blendv_ps __emu_mm_blendv_ps __forceinline __m128 _mm_blendv_ps( __m128 value, __m128 input, __m128 mask ) { return _mm_or_ps(_mm_and_ps(mask, input), _mm_andnot_ps(mask, value)); } +#undef _mm_blend_ps #define _mm_blend_ps __emu_mm_blend_ps __forceinline __m128 _mm_blend_ps( __m128 value, __m128 input, const int mask ) { assert(mask < 0x10); return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]); } +#undef _mm_blendv_epi8 #define _mm_blendv_epi8 __emu_mm_blendv_epi8 __forceinline __m128i _mm_blendv_epi8( __m128i value, __m128i input, __m128i mask ) { return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value)); } +#undef _mm_mullo_epi32 #define _mm_mullo_epi32 __emu_mm_mullo_epi32 __forceinline __m128i _mm_mullo_epi32( __m128i value, __m128i input ) { __m128i rvalue; char* _r = (char*)(&rvalue + 1); char* _v = (char*)(& value + 1); char* _i = (char*)(& input + 1); - for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32*)(_r + i)) = *((int32*)(_v + i))* *((int32*)(_i + i)); + for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32_t*)(_r + i)) = *((int32_t*)(_v + i))* *((int32_t*)(_i + i)); return rvalue; } - +#undef _mm_min_epi32 #define _mm_min_epi32 
__emu_mm_min_epi32 __forceinline __m128i _mm_min_epi32( __m128i value, __m128i input ) { return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input)); } +#undef _mm_max_epi32 #define _mm_max_epi32 __emu_mm_max_epi32 __forceinline __m128i _mm_max_epi32( __m128i value, __m128i input ) { return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input)); } +#undef _mm_extract_epi32 #define _mm_extract_epi32 __emu_mm_extract_epi32 __forceinline int _mm_extract_epi32( __m128i input, const int index ) { switch ( index ) { @@ -388,20 +394,24 @@ __forceinline int _mm_extract_epi32( __m128i input, const int index ) { } } +#undef _mm_insert_epi32 #define _mm_insert_epi32 __emu_mm_insert_epi32 __forceinline __m128i _mm_insert_epi32( __m128i value, int input, const int index ) { assert(index >= 0 && index < 4); ((int*)&value)[index] = input; return value; } +#undef _mm_extract_ps #define _mm_extract_ps __emu_mm_extract_ps __forceinline int _mm_extract_ps( __m128 input, const int index ) { - int32* ptr = (int32*)&input; return ptr[index]; + int32_t* ptr = (int32_t*)&input; return ptr[index]; } +#undef _mm_insert_ps #define _mm_insert_ps __emu_mm_insert_ps __forceinline __m128 _mm_insert_ps( __m128 value, __m128 input, const int index ) { assert(index < 0x100); ((float*)&value)[(index >> 4)&0x3] = ((float*)&input)[index >> 6]; return _mm_andnot_ps(_mm_lookupmask_ps[index&0xf], value); } +#undef _mm_round_ps #define _mm_round_ps __emu_mm_round_ps __forceinline __m128 _mm_round_ps( __m128 value, const int flags ) { @@ -415,18 +425,55 @@ __forceinline __m128 _mm_round_ps( __m128 value, const int flags ) return value; } -#ifdef _M_X64 +# ifdef _M_X64 +#undef _mm_insert_epi64 #define _mm_insert_epi64 __emu_mm_insert_epi64 __forceinline __m128i _mm_insert_epi64( __m128i value, __int64 input, const int index ) { assert(size_t(index) < 4); ((__int64*)&value)[index] = input; return value; } +#undef _mm_extract_epi64 #define _mm_extract_epi64 __emu_mm_extract_epi64 __forceinline 
__int64 _mm_extract_epi64( __m128i input, const int index ) { assert(size_t(index) < 2); return index == 0 ? _mm_cvtsi128_si64x(input) : _mm_cvtsi128_si64x(_mm_unpackhi_epi64(input, input)); } -#endif +# endif + +# endif + +#undef _mm_fabs_ps +#define _mm_fabs_ps(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))) + +/* Return a __m128 with every element set to the largest element of v. */ +ccl_device_inline __m128 _mm_hmax_ps(__m128 v) +{ + /* v[0, 1, 2, 3] => [0, 1, 0, 1] and [2, 3, 2, 3] => v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] */ + v = _mm_max_ps(_mm_movehl_ps(v, v), _mm_movelh_ps(v, v)); + /* v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] => [4 times max(1, 3)] and [4 times max(0, 2)] => v[4 times max(0, 1, 2, 3)] */ + v = _mm_max_ps(_mm_movehdup_ps(v), _mm_moveldup_ps(v)); + return v; +} + +/* Return the sum of the four elements of x. */ +ccl_device_inline float _mm_hsum_ss(__m128 x) +{ + __m128 a = _mm_movehdup_ps(x); + __m128 b = _mm_add_ps(x, a); + return _mm_cvtss_f32(_mm_add_ss(_mm_movehl_ps(a, b), b)); +} + +/* Return a __m128 with every element set to the sum of the four elements of x. */ +ccl_device_inline __m128 _mm_hsum_ps(__m128 x) +{ + x = _mm_hadd_ps(x, x); + x = _mm_hadd_ps(x, x); + return x; +} + +/* Replace elements of x with zero where mask isn't set. */ +#undef _mm_mask_ps +#define _mm_mask_ps(x, mask) _mm_blendv_ps(_mm_setzero_ps(), x, mask) #endif diff --git a/intern/cycles/util/util_string.cpp b/intern/cycles/util/util_string.cpp index a1008d510d1..94ad512982c 100644 --- a/intern/cycles/util/util_string.cpp +++ b/intern/cycles/util/util_string.cpp @@ -148,6 +148,12 @@ void string_replace(string& haystack, const string& needle, const string& other) string string_remove_trademark(const string &s) { string result = s; + + /* Special case, so we don;t leave sequential spaces behind. */ + /* TODO(sergey): Consider using regex perhaps? 
*/ + string_replace(result, " (TM)", ""); + string_replace(result, " (R)", ""); + string_replace(result, "(TM)", ""); string_replace(result, "(R)", ""); diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp index fb0c34e1dc4..6ed97b0e0a6 100644 --- a/intern/cycles/util/util_task.cpp +++ b/intern/cycles/util/util_task.cpp @@ -206,9 +206,9 @@ void TaskScheduler::init(int num_threads) threads.resize(num_threads); const int num_groups = system_cpu_group_count(); - unsigned short num_process_groups; + unsigned short num_process_groups = 0; vector<unsigned short> process_groups; - int current_group_threads; + int current_group_threads = 0; if(num_groups > 1) { process_groups.resize(num_groups); num_process_groups = system_cpu_process_groups(num_groups, diff --git a/intern/cycles/util/util_texture.h b/intern/cycles/util/util_texture.h index aff928ea2ee..df255f43059 100644 --- a/intern/cycles/util/util_texture.h +++ b/intern/cycles/util/util_texture.h @@ -21,62 +21,22 @@ CCL_NAMESPACE_BEGIN /* Texture limits on devices. 
*/ -/* CPU */ -#define TEX_NUM_FLOAT4_CPU 1024 -#define TEX_NUM_BYTE4_CPU 1024 -#define TEX_NUM_HALF4_CPU 1024 -#define TEX_NUM_FLOAT_CPU 1024 -#define TEX_NUM_BYTE_CPU 1024 -#define TEX_NUM_HALF_CPU 1024 -#define TEX_START_FLOAT4_CPU 0 -#define TEX_START_BYTE4_CPU TEX_NUM_FLOAT4_CPU -#define TEX_START_HALF4_CPU (TEX_NUM_FLOAT4_CPU + TEX_NUM_BYTE4_CPU) -#define TEX_START_FLOAT_CPU (TEX_NUM_FLOAT4_CPU + TEX_NUM_BYTE4_CPU + TEX_NUM_HALF4_CPU) -#define TEX_START_BYTE_CPU (TEX_NUM_FLOAT4_CPU + TEX_NUM_BYTE4_CPU + TEX_NUM_HALF4_CPU + TEX_NUM_FLOAT_CPU) -#define TEX_START_HALF_CPU (TEX_NUM_FLOAT4_CPU + TEX_NUM_BYTE4_CPU + TEX_NUM_HALF4_CPU + TEX_NUM_FLOAT_CPU + TEX_NUM_BYTE_CPU) - /* CUDA (Geforce 4xx and 5xx) */ -#define TEX_NUM_FLOAT4_CUDA 5 -#define TEX_NUM_BYTE4_CUDA 85 -#define TEX_NUM_HALF4_CUDA 0 -#define TEX_NUM_FLOAT_CUDA 0 -#define TEX_NUM_BYTE_CUDA 0 -#define TEX_NUM_HALF_CUDA 0 -#define TEX_START_FLOAT4_CUDA 0 -#define TEX_START_BYTE4_CUDA TEX_NUM_FLOAT4_CUDA -#define TEX_START_HALF4_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA) -#define TEX_START_FLOAT_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA + TEX_NUM_HALF4_CUDA) -#define TEX_START_BYTE_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA + TEX_NUM_HALF4_CUDA + TEX_NUM_FLOAT_CUDA) -#define TEX_START_HALF_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA + TEX_NUM_HALF4_CUDA + TEX_NUM_FLOAT_CUDA + TEX_NUM_BYTE_CUDA) - -/* CUDA (Kepler, Geforce 6xx and above) */ -#define TEX_NUM_FLOAT4_CUDA_KEPLER 1024 -#define TEX_NUM_BYTE4_CUDA_KEPLER 1024 -#define TEX_NUM_HALF4_CUDA_KEPLER 1024 -#define TEX_NUM_FLOAT_CUDA_KEPLER 1024 -#define TEX_NUM_BYTE_CUDA_KEPLER 1024 -#define TEX_NUM_HALF_CUDA_KEPLER 1024 -#define TEX_START_FLOAT4_CUDA_KEPLER 0 -#define TEX_START_BYTE4_CUDA_KEPLER TEX_NUM_FLOAT4_CUDA_KEPLER -#define TEX_START_HALF4_CUDA_KEPLER (TEX_NUM_FLOAT4_CUDA_KEPLER + TEX_NUM_BYTE4_CUDA_KEPLER) -#define TEX_START_FLOAT_CUDA_KEPLER (TEX_NUM_FLOAT4_CUDA_KEPLER + TEX_NUM_BYTE4_CUDA_KEPLER + TEX_NUM_HALF4_CUDA_KEPLER) 
-#define TEX_START_BYTE_CUDA_KEPLER (TEX_NUM_FLOAT4_CUDA_KEPLER + TEX_NUM_BYTE4_CUDA_KEPLER + TEX_NUM_HALF4_CUDA_KEPLER + TEX_NUM_FLOAT_CUDA_KEPLER) -#define TEX_START_HALF_CUDA_KEPLER (TEX_NUM_FLOAT4_CUDA_KEPLER + TEX_NUM_BYTE4_CUDA_KEPLER + TEX_NUM_HALF4_CUDA_KEPLER + TEX_NUM_FLOAT_CUDA_KEPLER + TEX_NUM_BYTE_CUDA_KEPLER) - -/* OpenCL */ -#define TEX_NUM_FLOAT4_OPENCL 1024 -#define TEX_NUM_BYTE4_OPENCL 1024 -#define TEX_NUM_HALF4_OPENCL 0 -#define TEX_NUM_FLOAT_OPENCL 1024 -#define TEX_NUM_BYTE_OPENCL 1024 -#define TEX_NUM_HALF_OPENCL 0 -#define TEX_START_FLOAT4_OPENCL 0 -#define TEX_START_BYTE4_OPENCL TEX_NUM_FLOAT4_OPENCL -#define TEX_START_HALF4_OPENCL (TEX_NUM_FLOAT4_OPENCL + TEX_NUM_BYTE4_OPENCL) -#define TEX_START_FLOAT_OPENCL (TEX_NUM_FLOAT4_OPENCL + TEX_NUM_BYTE4_OPENCL + TEX_NUM_HALF4_OPENCL) -#define TEX_START_BYTE_OPENCL (TEX_NUM_FLOAT4_OPENCL + TEX_NUM_BYTE4_OPENCL + TEX_NUM_HALF4_OPENCL + TEX_NUM_FLOAT_OPENCL) -#define TEX_START_HALF_OPENCL (TEX_NUM_FLOAT4_OPENCL + TEX_NUM_BYTE4_OPENCL + TEX_NUM_HALF4_OPENCL + TEX_NUM_FLOAT_OPENCL + TEX_NUM_BYTE_OPENCL) - +#define TEX_NUM_FLOAT4_CUDA 5 +#define TEX_NUM_BYTE4_CUDA 84 +#define TEX_NUM_HALF4_CUDA 0 +#define TEX_NUM_FLOAT_CUDA 0 +#define TEX_NUM_BYTE_CUDA 0 +#define TEX_NUM_HALF_CUDA 0 +#define TEX_START_FLOAT4_CUDA 0 +#define TEX_START_BYTE4_CUDA TEX_NUM_FLOAT4_CUDA +#define TEX_START_HALF4_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA) +#define TEX_START_FLOAT_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA + TEX_NUM_HALF4_CUDA) +#define TEX_START_BYTE_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA + TEX_NUM_HALF4_CUDA + TEX_NUM_FLOAT_CUDA) +#define TEX_START_HALF_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA + TEX_NUM_HALF4_CUDA + TEX_NUM_FLOAT_CUDA + TEX_NUM_BYTE_CUDA) + +/* Any architecture other than old CUDA cards */ +#define TEX_NUM_MAX (INT_MAX >> 4) /* Color to use when textures are not found. 
*/ #define TEX_IMAGE_MISSING_R 1 @@ -84,6 +44,14 @@ CCL_NAMESPACE_BEGIN #define TEX_IMAGE_MISSING_B 1 #define TEX_IMAGE_MISSING_A 1 +#if defined (__KERNEL_CUDA__) && (__CUDA_ARCH__ < 300) +# define kernel_tex_type(tex) (tex < TEX_START_BYTE4_CUDA ? IMAGE_DATA_TYPE_FLOAT4 : IMAGE_DATA_TYPE_BYTE4) +# define kernel_tex_index(tex) (tex) +#else +# define kernel_tex_type(tex) (tex & IMAGE_DATA_TYPE_MASK) +# define kernel_tex_index(tex) (tex >> IMAGE_DATA_TYPE_SHIFT) +#endif + CCL_NAMESPACE_END #endif /* __UTIL_TEXTURE_H__ */ diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index aa22f6a2c57..a5d1d7152d5 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -18,78 +18,75 @@ #define __UTIL_TYPES_H__ #ifndef __KERNEL_OPENCL__ - -#include <stdlib.h> - +# include <stdlib.h> #endif /* Bitness */ #if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) -#define __KERNEL_64_BIT__ +# define __KERNEL_64_BIT__ #endif /* Qualifiers for kernel code shared by CPU and GPU */ #ifndef __KERNEL_GPU__ - -#define ccl_device static inline -#define ccl_device_noinline static -#define ccl_global -#define ccl_constant -#define ccl_local -#define ccl_local_param -#define ccl_private -#define ccl_restrict __restrict -#define __KERNEL_WITH_SSE_ALIGN__ - -#if defined(_WIN32) && !defined(FREE_WINDOWS) -#define ccl_device_inline static __forceinline -#define ccl_device_forceinline static __forceinline -#define ccl_align(...) __declspec(align(__VA_ARGS__)) -#ifdef __KERNEL_64_BIT__ -#define ccl_try_align(...) __declspec(align(__VA_ARGS__)) -#else -#undef __KERNEL_WITH_SSE_ALIGN__ -#define ccl_try_align(...) 
/* not support for function arguments (error C2719) */ -#endif -#define ccl_may_alias -#define ccl_always_inline __forceinline -#define ccl_never_inline __declspec(noinline) -#define ccl_maybe_unused - -#else - -#define ccl_device_inline static inline __attribute__((always_inline)) -#define ccl_device_forceinline static inline __attribute__((always_inline)) -#define ccl_align(...) __attribute__((aligned(__VA_ARGS__))) -#ifndef FREE_WINDOWS64 -#define __forceinline inline __attribute__((always_inline)) -#endif -#define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__))) -#define ccl_may_alias __attribute__((__may_alias__)) -#define ccl_always_inline __attribute__((always_inline)) -#define ccl_never_inline __attribute__((noinline)) -#define ccl_maybe_unused __attribute__((used)) - -#endif - -#endif +# define ccl_device static inline +# define ccl_device_noinline static +# define ccl_global +# define ccl_constant +# define ccl_local +# define ccl_local_param +# define ccl_private +# define ccl_restrict __restrict +# define __KERNEL_WITH_SSE_ALIGN__ + +# if defined(_WIN32) && !defined(FREE_WINDOWS) +# define ccl_device_inline static __forceinline +# define ccl_device_forceinline static __forceinline +# define ccl_align(...) __declspec(align(__VA_ARGS__)) +# ifdef __KERNEL_64_BIT__ +# define ccl_try_align(...) __declspec(align(__VA_ARGS__)) +# else /* __KERNEL_64_BIT__ */ +# undef __KERNEL_WITH_SSE_ALIGN__ +/* No support for function arguments (error C2719). */ +# define ccl_try_align(...) +# endif /* __KERNEL_64_BIT__ */ +# define ccl_may_alias +# define ccl_always_inline __forceinline +# define ccl_never_inline __declspec(noinline) +# define ccl_maybe_unused +# else /* _WIN32 && !FREE_WINDOWS */ +# define ccl_device_inline static inline __attribute__((always_inline)) +# define ccl_device_forceinline static inline __attribute__((always_inline)) +# define ccl_align(...) 
__attribute__((aligned(__VA_ARGS__))) +# ifndef FREE_WINDOWS64 +# define __forceinline inline __attribute__((always_inline)) +# endif +# define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__))) +# define ccl_may_alias __attribute__((__may_alias__)) +# define ccl_always_inline __attribute__((always_inline)) +# define ccl_never_inline __attribute__((noinline)) +# define ccl_maybe_unused __attribute__((used)) +# endif /* _WIN32 && !FREE_WINDOWS */ + +/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). */ +# if defined(__GNUC__) && (__GNUC__ >= 7) /* gcc7.0+ only */ +# define ATTR_FALLTHROUGH __attribute__((fallthrough)) +# else +# define ATTR_FALLTHROUGH ((void)0) +# endif +#endif /* __KERNEL_GPU__ */ /* Standard Integer Types */ #ifndef __KERNEL_GPU__ - /* int8_t, uint16_t, and friends */ -#ifndef _WIN32 -#include <stdint.h> -#endif - +# ifndef _WIN32 +# include <stdint.h> +# endif /* SIMD Types */ - -#include "util/util_optimization.h" - -#endif +# include "util/util_optimization.h" +#endif /* __KERNEL_GPU__ */ CCL_NAMESPACE_BEGIN @@ -102,24 +99,19 @@ CCL_NAMESPACE_BEGIN /* Shorter Unsigned Names */ #ifndef __KERNEL_OPENCL__ - typedef unsigned char uchar; typedef unsigned int uint; - +typedef unsigned short ushort; #endif /* Fixed Bits Types */ #ifdef __KERNEL_OPENCL__ - typedef ulong uint64_t; - #endif #ifndef __KERNEL_GPU__ - -#ifdef _WIN32 - +# ifdef _WIN32 typedef signed char int8_t; typedef unsigned char uint8_t; @@ -131,360 +123,26 @@ typedef unsigned int uint32_t; typedef long long int64_t; typedef unsigned long long uint64_t; - -#ifdef __KERNEL_64_BIT__ +# ifdef __KERNEL_64_BIT__ typedef int64_t ssize_t; -#else +# else typedef int32_t ssize_t; -#endif - -#endif +# endif +# endif /* _WIN32 */ /* Generic Memory Pointer */ typedef uint64_t device_ptr; +#endif /* __KERNEL_GPU__ */ -/* Vector Types */ - -struct uchar2 { - uchar x, y; - - __forceinline uchar operator[](int i) const { return *(&x + i); } - __forceinline uchar& operator[](int 
i) { return *(&x + i); } -}; - -struct uchar3 { - uchar x, y, z; - - __forceinline uchar operator[](int i) const { return *(&x + i); } - __forceinline uchar& operator[](int i) { return *(&x + i); } -}; - -struct uchar4 { - uchar x, y, z, w; - - __forceinline uchar operator[](int i) const { return *(&x + i); } - __forceinline uchar& operator[](int i) { return *(&x + i); } -}; - -struct int2 { - int x, y; - - __forceinline int operator[](int i) const { return *(&x + i); } - __forceinline int& operator[](int i) { return *(&x + i); } -}; - -struct ccl_try_align(16) int3 { -#ifdef __KERNEL_SSE__ - union { - __m128i m128; - struct { int x, y, z, w; }; - }; - - __forceinline int3() {} - __forceinline int3(const __m128i& a) : m128(a) {} - __forceinline operator const __m128i&(void) const { return m128; } - __forceinline operator __m128i&(void) { return m128; } - - int3(const int3& a) { m128 = a.m128; } - int3& operator =(const int3& a) { m128 = a.m128; return *this; } -#else - int x, y, z, w; -#endif - - __forceinline int operator[](int i) const { return *(&x + i); } - __forceinline int& operator[](int i) { return *(&x + i); } -}; - -struct ccl_try_align(16) int4 { -#ifdef __KERNEL_SSE__ - union { - __m128i m128; - struct { int x, y, z, w; }; - }; - - __forceinline int4() {} - __forceinline int4(const __m128i& a) : m128(a) {} - __forceinline operator const __m128i&(void) const { return m128; } - __forceinline operator __m128i&(void) { return m128; } - - int4(const int4& a) : m128(a.m128) {} - int4& operator=(const int4& a) { m128 = a.m128; return *this; } -#else - int x, y, z, w; -#endif - - __forceinline int operator[](int i) const { return *(&x + i); } - __forceinline int& operator[](int i) { return *(&x + i); } -}; - -struct uint2 { - uint x, y; - - __forceinline uint operator[](uint i) const { return *(&x + i); } - __forceinline uint& operator[](uint i) { return *(&x + i); } -}; - -struct uint3 { - uint x, y, z; - - __forceinline uint operator[](uint i) const { return 
*(&x + i); } - __forceinline uint& operator[](uint i) { return *(&x + i); } -}; - -struct uint4 { - uint x, y, z, w; - - __forceinline uint operator[](uint i) const { return *(&x + i); } - __forceinline uint& operator[](uint i) { return *(&x + i); } -}; - -struct float2 { - float x, y; - - __forceinline float operator[](int i) const { return *(&x + i); } - __forceinline float& operator[](int i) { return *(&x + i); } -}; - -struct ccl_try_align(16) float3 { -#ifdef __KERNEL_SSE__ - union { - __m128 m128; - struct { float x, y, z, w; }; - }; - - __forceinline float3() {} - __forceinline float3(const __m128& a) : m128(a) {} - __forceinline operator const __m128&(void) const { return m128; } - __forceinline operator __m128&(void) { return m128; } - - __forceinline float3(const float3& a) : m128(a.m128) {} - __forceinline float3& operator =(const float3& a) { m128 = a.m128; return *this; } -#else - float x, y, z, w; -#endif - - __forceinline float operator[](int i) const { return *(&x + i); } - __forceinline float& operator[](int i) { return *(&x + i); } -}; - -struct ccl_try_align(16) float4 { -#ifdef __KERNEL_SSE__ - union { - __m128 m128; - struct { float x, y, z, w; }; - }; - - __forceinline float4() {} - __forceinline float4(const __m128& a) : m128(a) {} - __forceinline operator const __m128&(void) const { return m128; } - __forceinline operator __m128&(void) { return m128; } - - __forceinline float4(const float4& a) : m128(a.m128) {} - __forceinline float4& operator =(const float4& a) { m128 = a.m128; return *this; } - -#else - float x, y, z, w; -#endif - - __forceinline float operator[](int i) const { return *(&x + i); } - __forceinline float& operator[](int i) { return *(&x + i); } -}; - -template<typename T> -class vector3 -{ -public: - T x, y, z; - - ccl_always_inline vector3() {} - ccl_always_inline vector3(const T& a) - : x(a), y(a), z(a) {} - ccl_always_inline vector3(const T& x, const T& y, const T& z) - : x(x), y(y), z(z) {} -}; - -#endif - -#ifndef 
__KERNEL_GPU__ - -/* Vector Type Constructors - * - * OpenCL does not support C++ class, so we use these instead. */ - -ccl_device_inline uchar2 make_uchar2(uchar x, uchar y) -{ - uchar2 a = {x, y}; - return a; -} - -ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z) -{ - uchar3 a = {x, y, z}; - return a; -} - -ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w) -{ - uchar4 a = {x, y, z, w}; - return a; -} - -ccl_device_inline int2 make_int2(int x, int y) -{ - int2 a = {x, y}; - return a; -} - -ccl_device_inline int3 make_int3(int x, int y, int z) -{ -#ifdef __KERNEL_SSE__ - int3 a; - a.m128 = _mm_set_epi32(0, z, y, x); -#else - int3 a = {x, y, z, 0}; -#endif - - return a; -} - -ccl_device_inline int4 make_int4(int x, int y, int z, int w) -{ -#ifdef __KERNEL_SSE__ - int4 a; - a.m128 = _mm_set_epi32(w, z, y, x); -#else - int4 a = {x, y, z, w}; -#endif - - return a; -} - -ccl_device_inline uint2 make_uint2(uint x, uint y) -{ - uint2 a = {x, y}; - return a; -} - -ccl_device_inline uint3 make_uint3(uint x, uint y, uint z) -{ - uint3 a = {x, y, z}; - return a; -} - -ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w) -{ - uint4 a = {x, y, z, w}; - return a; -} - -ccl_device_inline float2 make_float2(float x, float y) -{ - float2 a = {x, y}; - return a; -} - -ccl_device_inline float3 make_float3(float x, float y, float z) -{ -#ifdef __KERNEL_SSE__ - float3 a; - a.m128 = _mm_set_ps(0.0f, z, y, x); -#else - float3 a = {x, y, z, 0.0f}; -#endif - - return a; -} - -ccl_device_inline float4 make_float4(float x, float y, float z, float w) -{ -#ifdef __KERNEL_SSE__ - float4 a; - a.m128 = _mm_set_ps(w, z, y, x); -#else - float4 a = {x, y, z, w}; -#endif - - return a; -} - -ccl_device_inline int3 make_int3(int i) -{ -#ifdef __KERNEL_SSE__ - int3 a; - a.m128 = _mm_set1_epi32(i); -#else - int3 a = {i, i, i, i}; -#endif - - return a; -} - -ccl_device_inline int4 make_int4(int i) -{ -#ifdef __KERNEL_SSE__ - int4 a; - a.m128 = 
_mm_set1_epi32(i); -#else - int4 a = {i, i, i, i}; -#endif - - return a; -} - -ccl_device_inline float3 make_float3(float f) -{ -#ifdef __KERNEL_SSE__ - float3 a; - a.m128 = _mm_set1_ps(f); -#else - float3 a = {f, f, f, f}; -#endif - - return a; -} - -ccl_device_inline float4 make_float4(float f) -{ -#ifdef __KERNEL_SSE__ - float4 a; - a.m128 = _mm_set1_ps(f); -#else - float4 a = {f, f, f, f}; -#endif - - return a; -} - -ccl_device_inline float4 make_float4(const int4& i) -{ -#ifdef __KERNEL_SSE__ - float4 a; - a.m128 = _mm_cvtepi32_ps(i.m128); -#else - float4 a = {(float)i.x, (float)i.y, (float)i.z, (float)i.w}; -#endif - - return a; -} - -ccl_device_inline int4 make_int4(const float3& f) +ccl_device_inline size_t align_up(size_t offset, size_t alignment) { -#ifdef __KERNEL_SSE__ - int4 a; - a.m128 = _mm_cvtps_epi32(f.m128); -#else - int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w}; -#endif - - return a; + return (offset + alignment - 1) & ~(alignment - 1); } -#endif - -ccl_device_inline size_t align_up(size_t offset, size_t alignment) +ccl_device_inline size_t divide_up(size_t x, size_t y) { - return (offset + alignment - 1) & ~(alignment - 1); + return (x + y - 1) / y; } ccl_device_inline size_t round_up(size_t x, size_t multiple) @@ -509,6 +167,25 @@ enum InterpolationType { INTERPOLATION_NUM_TYPES, }; +/* Texture types + * Since we store the type in the lower bits of a flat index, + * the shift and bit mask constant below need to be kept in sync. + */ + +enum ImageDataType { + IMAGE_DATA_TYPE_FLOAT4 = 0, + IMAGE_DATA_TYPE_BYTE4 = 1, + IMAGE_DATA_TYPE_HALF4 = 2, + IMAGE_DATA_TYPE_FLOAT = 3, + IMAGE_DATA_TYPE_BYTE = 4, + IMAGE_DATA_TYPE_HALF = 5, + + IMAGE_DATA_NUM_TYPES +}; + +#define IMAGE_DATA_TYPE_SHIFT 3 +#define IMAGE_DATA_TYPE_MASK 0x7 + /* Extension types for textures. * * Defines how the image is extrapolated past its original bounds. @@ -554,7 +231,7 @@ template<typename T> static inline T decltype_helper(T x) { return x; } * ... 
the compiler optimizes away the temp var */ #ifdef __GNUC__ #define CHECK_TYPE(var, type) { \ - TYPEOF(var) *__tmp; \ + TYPEOF(var) *__tmp; \ __tmp = (type *)NULL; \ (void)__tmp; \ } (void)0 @@ -576,5 +253,50 @@ template<typename T> static inline T decltype_helper(T x) { return x; } CCL_NAMESPACE_END +#ifndef __KERNEL_GPU__ +# include <cassert> +# define util_assert(statement) assert(statement) +#else +# define util_assert(statement) +#endif + +/* Vectorized types declaration. */ +#include "util/util_types_uchar2.h" +#include "util/util_types_uchar3.h" +#include "util/util_types_uchar4.h" + +#include "util/util_types_int2.h" +#include "util/util_types_int3.h" +#include "util/util_types_int4.h" + +#include "util/util_types_uint2.h" +#include "util/util_types_uint3.h" +#include "util/util_types_uint4.h" + +#include "util/util_types_float2.h" +#include "util/util_types_float3.h" +#include "util/util_types_float4.h" + +#include "util/util_types_vector3.h" + +/* Vectorized types implementation. */ +#include "util/util_types_uchar2_impl.h" +#include "util/util_types_uchar3_impl.h" +#include "util/util_types_uchar4_impl.h" + +#include "util/util_types_int2_impl.h" +#include "util/util_types_int3_impl.h" +#include "util/util_types_int4_impl.h" + +#include "util/util_types_uint2_impl.h" +#include "util/util_types_uint3_impl.h" +#include "util/util_types_uint4_impl.h" + +#include "util/util_types_float2_impl.h" +#include "util/util_types_float3_impl.h" +#include "util/util_types_float4_impl.h" + +#include "util/util_types_vector3_impl.h" + #endif /* __UTIL_TYPES_H__ */ diff --git a/intern/cycles/util/util_types_float2.h b/intern/cycles/util/util_types_float2.h new file mode 100644 index 00000000000..ec7a1f717a1 --- /dev/null +++ b/intern/cycles/util/util_types_float2.h @@ -0,0 +1,40 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_FLOAT2_H__ +#define __UTIL_TYPES_FLOAT2_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct float2 { + float x, y; + + __forceinline float operator[](int i) const; + __forceinline float& operator[](int i); +}; + +ccl_device_inline float2 make_float2(float x, float y); +ccl_device_inline void print_float2(const char *label, const float2& a); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_FLOAT2_H__ */ diff --git a/intern/cycles/util/util_types_float2_impl.h b/intern/cycles/util/util_types_float2_impl.h new file mode 100644 index 00000000000..782dda195eb --- /dev/null +++ b/intern/cycles/util/util_types_float2_impl.h @@ -0,0 +1,59 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __UTIL_TYPES_FLOAT2_IMPL_H__ +#define __UTIL_TYPES_FLOAT2_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +#ifndef __KERNEL_GPU__ +# include <cstdio> +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +__forceinline float float2::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 2); + return *(&x + i); +} + +__forceinline float& float2::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 2); + return *(&x + i); +} + +ccl_device_inline float2 make_float2(float x, float y) +{ + float2 a = {x, y}; + return a; +} + +ccl_device_inline void print_float2(const char *label, const float2& a) +{ + printf("%s: %.8f %.8f\n", label, (double)a.x, (double)a.y); +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_FLOAT2_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_float3.h b/intern/cycles/util/util_types_float3.h new file mode 100644 index 00000000000..28146ad04f7 --- /dev/null +++ b/intern/cycles/util/util_types_float3.h @@ -0,0 +1,57 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_FLOAT3_H__ +#define __UTIL_TYPES_FLOAT3_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct ccl_try_align(16) float3 { +#ifdef __KERNEL_SSE__ + union { + __m128 m128; + struct { float x, y, z, w; }; + }; + + __forceinline float3(); + __forceinline float3(const float3& a); + __forceinline explicit float3(const __m128& a); + + __forceinline operator const __m128&(void) const; + __forceinline operator __m128&(void); + + __forceinline float3& operator =(const float3& a); +#else /* __KERNEL_SSE__ */ + float x, y, z, w; +#endif /* __KERNEL_SSE__ */ + + __forceinline float operator[](int i) const; + __forceinline float& operator[](int i); +}; + +ccl_device_inline float3 make_float3(float f); +ccl_device_inline float3 make_float3(float x, float y, float z); +ccl_device_inline void print_float3(const char *label, const float3& a); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_FLOAT3_H__ */ diff --git a/intern/cycles/util/util_types_float3_impl.h b/intern/cycles/util/util_types_float3_impl.h new file mode 100644 index 00000000000..45f61767d3f --- /dev/null +++ b/intern/cycles/util/util_types_float3_impl.h @@ -0,0 +1,105 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_FLOAT3_IMPL_H__ +#define __UTIL_TYPES_FLOAT3_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +#ifndef __KERNEL_GPU__ +# include <cstdio> +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +#ifdef __KERNEL_SSE__ +__forceinline float3::float3() +{ +} + +__forceinline float3::float3(const float3& a) + : m128(a.m128) +{ +} + +__forceinline float3::float3(const __m128& a) + : m128(a) +{ +} + +__forceinline float3::operator const __m128&(void) const +{ + return m128; +} + +__forceinline float3::operator __m128&(void) +{ + return m128; +} + +__forceinline float3& float3::operator =(const float3& a) +{ + m128 = a.m128; + return *this; +} +#endif /* __KERNEL_SSE__ */ + +__forceinline float float3::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 3); + return *(&x + i); +} + +__forceinline float& float3::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 3); + return *(&x + i); +} + +ccl_device_inline float3 make_float3(float f) +{ +#ifdef __KERNEL_SSE__ + float3 a(_mm_set1_ps(f)); +#else + float3 a = {f, f, f, f}; +#endif + return a; +} + +ccl_device_inline float3 make_float3(float x, float y, float z) +{ +#ifdef __KERNEL_SSE__ + float3 a(_mm_set_ps(0.0f, z, y, x)); +#else + float3 a = {x, y, z, 0.0f}; +#endif + return a; +} + +ccl_device_inline void print_float3(const char *label, const float3& a) +{ + printf("%s: %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z); +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_FLOAT3_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_float4.h b/intern/cycles/util/util_types_float4.h new file mode 100644 index 00000000000..a7d9abe1b95 --- /dev/null +++ b/intern/cycles/util/util_types_float4.h @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_FLOAT4_H__ +#define __UTIL_TYPES_FLOAT4_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct int4; + +struct ccl_try_align(16) float4 { +#ifdef __KERNEL_SSE__ + union { + __m128 m128; + struct { float x, y, z, w; }; + }; + + __forceinline float4(); + __forceinline float4(const float4& a); + __forceinline explicit float4(const __m128& a); + + __forceinline operator const __m128&(void) const; + __forceinline operator __m128&(void); + + __forceinline float4& operator =(const float4& a); + +#else /* __KERNEL_SSE__ */ + float x, y, z, w; +#endif /* __KERNEL_SSE__ */ + + __forceinline float operator[](int i) const; + __forceinline float& operator[](int i); +}; + +ccl_device_inline float4 make_float4(float f); +ccl_device_inline float4 make_float4(float x, float y, float z, float w); +ccl_device_inline float4 make_float4(const int4& i); +ccl_device_inline void print_float4(const char *label, const float4& a); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_FLOAT4_H__ */ diff --git a/intern/cycles/util/util_types_float4_impl.h b/intern/cycles/util/util_types_float4_impl.h new file mode 100644 index 00000000000..ff3ec4d4ecf --- /dev/null +++ b/intern/cycles/util/util_types_float4_impl.h @@ -0,0 +1,117 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the 
License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_FLOAT4_IMPL_H__ +#define __UTIL_TYPES_FLOAT4_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +#ifndef __KERNEL_GPU__ +# include <cstdio> +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +#ifdef __KERNEL_SSE__ +__forceinline float4::float4() +{ +} + +__forceinline float4::float4(const float4& a) + : m128(a.m128) +{ +} + +__forceinline float4::float4(const __m128& a) + : m128(a) +{ +} + +__forceinline float4::operator const __m128&(void) const +{ + return m128; +} + +__forceinline float4::operator __m128&(void) +{ + return m128; +} + +__forceinline float4& float4::operator =(const float4& a) +{ + m128 = a.m128; + return *this; +} +#endif /* __KERNEL_SSE__ */ + +__forceinline float float4::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 4); + return *(&x + i); +} + +__forceinline float& float4::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 4); + return *(&x + i); +} + +ccl_device_inline float4 make_float4(float f) +{ +#ifdef __KERNEL_SSE__ + float4 a(_mm_set1_ps(f)); +#else + float4 a = {f, f, f, f}; +#endif + return a; +} + +ccl_device_inline float4 make_float4(float x, float y, float z, float w) +{ +#ifdef __KERNEL_SSE__ + float4 a(_mm_set_ps(w, z, y, x)); +#else + float4 a = {x, y, z, w}; +#endif + return a; +} + +ccl_device_inline float4 make_float4(const int4& i) +{ +#ifdef __KERNEL_SSE__ + float4 a(_mm_cvtepi32_ps(i.m128)); +#else + float4 a = {(float)i.x, (float)i.y, (float)i.z, 
(float)i.w}; +#endif + return a; +} + +ccl_device_inline void print_float4(const char *label, const float4& a) +{ + printf("%s: %.8f %.8f %.8f %.8f\n", + label, + (double)a.x, (double)a.y, (double)a.z, (double)a.w); +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_FLOAT4_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_int2.h b/intern/cycles/util/util_types_int2.h new file mode 100644 index 00000000000..82e860f89eb --- /dev/null +++ b/intern/cycles/util/util_types_int2.h @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_INT2_H__ +#define __UTIL_TYPES_INT2_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct int2 { + int x, y; + + __forceinline int operator[](int i) const; + __forceinline int& operator[](int i); +}; + +ccl_device_inline int2 make_int2(int x, int y); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_INT2_H__ */ diff --git a/intern/cycles/util/util_types_int2_impl.h b/intern/cycles/util/util_types_int2_impl.h new file mode 100644 index 00000000000..c7d3942e723 --- /dev/null +++ b/intern/cycles/util/util_types_int2_impl.h @@ -0,0 +1,50 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_INT2_IMPL_H__ +#define __UTIL_TYPES_INT2_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +int int2::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 2); + return *(&x + i); +} + +int& int2::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 2); + return *(&x + i); +} + +ccl_device_inline int2 make_int2(int x, int y) +{ + int2 a = {x, y}; + return a; +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_INT2_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_int3.h b/intern/cycles/util/util_types_int3.h new file mode 100644 index 00000000000..9d43b201c02 --- /dev/null +++ b/intern/cycles/util/util_types_int3.h @@ -0,0 +1,57 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_INT3_H__ +#define __UTIL_TYPES_INT3_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct ccl_try_align(16) int3 { +#ifdef __KERNEL_SSE__ + union { + __m128i m128; + struct { int x, y, z, w; }; + }; + + __forceinline int3(); + __forceinline int3(const int3& a); + __forceinline explicit int3(const __m128i& a); + + __forceinline operator const __m128i&(void) const; + __forceinline operator __m128i&(void); + + __forceinline int3& operator =(const int3& a); +#else /* __KERNEL_SSE__ */ + int x, y, z, w; +#endif /* __KERNEL_SSE__ */ + + __forceinline int operator[](int i) const; + __forceinline int& operator[](int i); +}; + +ccl_device_inline int3 make_int3(int i); +ccl_device_inline int3 make_int3(int x, int y, int z); +ccl_device_inline void print_int3(const char *label, const int3& a); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_INT3_H__ */ diff --git a/intern/cycles/util/util_types_int3_impl.h b/intern/cycles/util/util_types_int3_impl.h new file mode 100644 index 00000000000..ada50c4812c --- /dev/null +++ b/intern/cycles/util/util_types_int3_impl.h @@ -0,0 +1,106 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_INT3_IMPL_H__ +#define __UTIL_TYPES_INT3_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +#ifndef __KERNEL_GPU__ +# include <cstdio> +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +#ifdef __KERNEL_SSE__ +__forceinline int3::int3() +{ +} + +__forceinline int3::int3(const __m128i& a) + : m128(a) +{ +} + +__forceinline int3::int3(const int3& a) + : m128(a.m128) +{ +} + +__forceinline int3::operator const __m128i&(void) const +{ + return m128; +} + +__forceinline int3::operator __m128i&(void) +{ + return m128; +} + +__forceinline int3& int3::operator =(const int3& a) +{ + m128 = a.m128; + return *this; +} +#endif /* __KERNEL_SSE__ */ + +__forceinline int int3::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 3); + return *(&x + i); +} + +__forceinline int& int3::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 3); + return *(&x + i); +} + +ccl_device_inline int3 make_int3(int i) +{ +#ifdef __KERNEL_SSE__ + int3 a(_mm_set1_epi32(i)); +#else + int3 a = {i, i, i, i}; +#endif + return a; +} + +ccl_device_inline int3 make_int3(int x, int y, int z) +{ +#ifdef __KERNEL_SSE__ + int3 a(_mm_set_epi32(0, z, y, x)); +#else + int3 a = {x, y, z, 0}; +#endif + + return a; +} + +ccl_device_inline void print_int3(const char *label, const int3& a) +{ + printf("%s: %d %d %d\n", label, a.x, a.y, a.z); +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_INT3_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_int4.h b/intern/cycles/util/util_types_int4.h new file mode 100644 index 00000000000..cdd0ecbdae5 --- /dev/null +++ b/intern/cycles/util/util_types_int4.h @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_INT4_H__ +#define __UTIL_TYPES_INT4_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ + +struct float3; + +struct ccl_try_align(16) int4 { +#ifdef __KERNEL_SSE__ + union { + __m128i m128; + struct { int x, y, z, w; }; + }; + + __forceinline int4(); + __forceinline int4(const int4& a); + __forceinline explicit int4(const __m128i& a); + + __forceinline operator const __m128i&(void) const; + __forceinline operator __m128i&(void); + + __forceinline int4& operator=(const int4& a); +#else /* __KERNEL_SSE__ */ + int x, y, z, w; +#endif /* __KERNEL_SSE__ */ + + __forceinline int operator[](int i) const; + __forceinline int& operator[](int i); +}; + +ccl_device_inline int4 make_int4(int i); +ccl_device_inline int4 make_int4(int x, int y, int z, int w); +ccl_device_inline int4 make_int4(const float3& f); +ccl_device_inline void print_int4(const char *label, const int4& a); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_INT4_H__ */ diff --git a/intern/cycles/util/util_types_int4_impl.h b/intern/cycles/util/util_types_int4_impl.h new file mode 100644 index 00000000000..07cdc88f2dc --- /dev/null +++ b/intern/cycles/util/util_types_int4_impl.h @@ -0,0 +1,115 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_INT4_IMPL_H__ +#define __UTIL_TYPES_INT4_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +#ifndef __KERNEL_GPU__ +# include <cstdio> +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +#ifdef __KERNEL_SSE__ +__forceinline int4::int4() +{ +} + +__forceinline int4::int4(const int4& a) + : m128(a.m128) +{ +} + +__forceinline int4::int4(const __m128i& a) + : m128(a) +{ +} + +__forceinline int4::operator const __m128i&(void) const +{ + return m128; +} + +__forceinline int4::operator __m128i&(void) +{ + return m128; +} + +__forceinline int4& int4::operator=(const int4& a) +{ + m128 = a.m128; + return *this; +} +#endif /* __KERNEL_SSE__ */ + +__forceinline int int4::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 4); + return *(&x + i); +} + +__forceinline int& int4::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 4); + return *(&x + i); +} + +ccl_device_inline int4 make_int4(int i) +{ +#ifdef __KERNEL_SSE__ + int4 a(_mm_set1_epi32(i)); +#else + int4 a = {i, i, i, i}; +#endif + return a; +} + +ccl_device_inline int4 make_int4(int x, int y, int z, int w) +{ +#ifdef __KERNEL_SSE__ + int4 a(_mm_set_epi32(w, z, y, x)); +#else + int4 a = {x, y, z, w}; +#endif + return a; +} + +ccl_device_inline int4 make_int4(const float3& f) +{ +#ifdef __KERNEL_SSE__ + int4 a(_mm_cvtps_epi32(f.m128)); +#else + int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w}; +#endif + return a; +} + +ccl_device_inline void print_int4(const char 
*label, const int4& a) +{ + printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w); +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_INT4_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_uchar2.h b/intern/cycles/util/util_types_uchar2.h new file mode 100644 index 00000000000..f618a2234ca --- /dev/null +++ b/intern/cycles/util/util_types_uchar2.h @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UCHAR2_H__ +#define __UTIL_TYPES_UCHAR2_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct uchar2 { + uchar x, y; + + __forceinline uchar operator[](int i) const; + __forceinline uchar& operator[](int i); +}; + +ccl_device_inline uchar2 make_uchar2(uchar x, uchar y); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UCHAR2_H__ */ diff --git a/intern/cycles/util/util_types_uchar2_impl.h b/intern/cycles/util/util_types_uchar2_impl.h new file mode 100644 index 00000000000..d5f196d0ce0 --- /dev/null +++ b/intern/cycles/util/util_types_uchar2_impl.h @@ -0,0 +1,50 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UCHAR2_IMPL_H__ +#define __UTIL_TYPES_UCHAR2_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +uchar uchar2::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 2); + return *(&x + i); +} + +uchar& uchar2::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 2); + return *(&x + i); +} + +ccl_device_inline uchar2 make_uchar2(uchar x, uchar y) +{ + uchar2 a = {x, y}; + return a; +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UCHAR2_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_uchar3.h b/intern/cycles/util/util_types_uchar3.h new file mode 100644 index 00000000000..1e3644e6fd6 --- /dev/null +++ b/intern/cycles/util/util_types_uchar3.h @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __UTIL_TYPES_UCHAR3_H__ +#define __UTIL_TYPES_UCHAR3_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct uchar3 { + uchar x, y, z; + + __forceinline uchar operator[](int i) const; + __forceinline uchar& operator[](int i); +}; + +ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UCHAR3_H__ */ diff --git a/intern/cycles/util/util_types_uchar3_impl.h b/intern/cycles/util/util_types_uchar3_impl.h new file mode 100644 index 00000000000..611021efb7f --- /dev/null +++ b/intern/cycles/util/util_types_uchar3_impl.h @@ -0,0 +1,50 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UCHAR3_IMPL_H__ +#define __UTIL_TYPES_UCHAR3_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +uchar uchar3::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 3); + return *(&x + i); +} + +uchar& uchar3::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 3); + return *(&x + i); +} + +ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z) +{ + uchar3 a = {x, y, z}; + return a; +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UCHAR3_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_uchar4.h b/intern/cycles/util/util_types_uchar4.h new file mode 100644 index 00000000000..3802cebbfb9 --- /dev/null +++ b/intern/cycles/util/util_types_uchar4.h @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UCHAR4_H__ +#define __UTIL_TYPES_UCHAR4_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct uchar4 { + uchar x, y, z, w; + + __forceinline uchar operator[](int i) const; + __forceinline uchar& operator[](int i); +}; + +ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UCHAR4_H__ */ diff --git a/intern/cycles/util/util_types_uchar4_impl.h b/intern/cycles/util/util_types_uchar4_impl.h new file mode 100644 index 00000000000..03039f60c54 --- /dev/null +++ b/intern/cycles/util/util_types_uchar4_impl.h @@ -0,0 +1,50 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UCHAR4_IMPL_H__ +#define __UTIL_TYPES_UCHAR4_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +uchar uchar4::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 4); + return *(&x + i); +} + +uchar& uchar4::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 4); + return *(&x + i); +} + +ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w) +{ + uchar4 a = {x, y, z, w}; + return a; +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UCHAR4_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_uint2.h b/intern/cycles/util/util_types_uint2.h new file mode 100644 index 00000000000..c4a31899614 --- /dev/null +++ b/intern/cycles/util/util_types_uint2.h @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UINT2_H__ +#define __UTIL_TYPES_UINT2_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct uint2 { + uint x, y; + + __forceinline uint operator[](uint i) const; + __forceinline uint& operator[](uint i); +}; + +ccl_device_inline uint2 make_uint2(uint x, uint y); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UINT2_H__ */ diff --git a/intern/cycles/util/util_types_uint2_impl.h b/intern/cycles/util/util_types_uint2_impl.h new file mode 100644 index 00000000000..b50ffa2667f --- /dev/null +++ b/intern/cycles/util/util_types_uint2_impl.h @@ -0,0 +1,48 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UINT2_IMPL_H__ +#define __UTIL_TYPES_UINT2_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +__forceinline uint uint2::operator[](uint i) const +{ + util_assert(i < 2); + return *(&x + i); +} + +__forceinline uint& uint2::operator[](uint i) +{ + util_assert(i < 2); + return *(&x + i); +} + +ccl_device_inline uint2 make_uint2(uint x, uint y) +{ + uint2 a = {x, y}; + return a; +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UINT2_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_uint3.h b/intern/cycles/util/util_types_uint3.h new file mode 100644 index 00000000000..aeeecd2df06 --- /dev/null +++ b/intern/cycles/util/util_types_uint3.h @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UINT3_H__ +#define __UTIL_TYPES_UINT3_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct uint3 { + uint x, y, z; + + __forceinline uint operator[](uint i) const; + __forceinline uint& operator[](uint i); +}; + +ccl_device_inline uint3 make_uint3(uint x, uint y, uint z); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UINT3_H__ */ diff --git a/intern/cycles/util/util_types_uint3_impl.h b/intern/cycles/util/util_types_uint3_impl.h new file mode 100644 index 00000000000..26005d5baff --- /dev/null +++ b/intern/cycles/util/util_types_uint3_impl.h @@ -0,0 +1,48 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UINT3_IMPL_H__ +#define __UTIL_TYPES_UINT3_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +__forceinline uint uint3::operator[](uint i) const +{ + util_assert(i < 3); + return *(&x + i); +} + +__forceinline uint& uint3::operator[](uint i) +{ + util_assert(i < 3); + return *(&x + i); +} + +ccl_device_inline uint3 make_uint3(uint x, uint y, uint z) +{ + uint3 a = {x, y, z}; + return a; +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UINT3_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_uint4.h b/intern/cycles/util/util_types_uint4.h new file mode 100644 index 00000000000..2d3a7bb85e4 --- /dev/null +++ b/intern/cycles/util/util_types_uint4.h @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UINT4_H__ +#define __UTIL_TYPES_UINT4_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct uint4 { + uint x, y, z, w; + + __forceinline uint operator[](uint i) const; + __forceinline uint& operator[](uint i); +}; + +ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UINT4_H__ */ diff --git a/intern/cycles/util/util_types_uint4_impl.h b/intern/cycles/util/util_types_uint4_impl.h new file mode 100644 index 00000000000..6d48131a446 --- /dev/null +++ b/intern/cycles/util/util_types_uint4_impl.h @@ -0,0 +1,48 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UINT4_IMPL_H__ +#define __UTIL_TYPES_UINT4_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +__forceinline uint uint4::operator[](uint i) const +{ + util_assert(i < 4); + return *(&x + i); +} + +__forceinline uint& uint4::operator[](uint i) +{ + util_assert(i < 4); + return *(&x + i); +} + +ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w) +{ + uint4 a = {x, y, z, w}; + return a; +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UINT4_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_vector3.h b/intern/cycles/util/util_types_vector3.h new file mode 100644 index 00000000000..12acf9dc959 --- /dev/null +++ b/intern/cycles/util/util_types_vector3.h @@ -0,0 +1,41 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_VECTOR3_H__ +#define __UTIL_TYPES_VECTOR3_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." 
+#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +template<typename T> +class vector3 +{ +public: + T x, y, z; + + __forceinline vector3(); + __forceinline vector3(const T& a); + __forceinline vector3(const T& x, const T& y, const T& z); +}; +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_VECTOR3_H__ */ diff --git a/intern/cycles/util/util_types_vector3_impl.h b/intern/cycles/util/util_types_vector3_impl.h new file mode 100644 index 00000000000..2f6b8368540 --- /dev/null +++ b/intern/cycles/util/util_types_vector3_impl.h @@ -0,0 +1,47 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_VECTOR3_IMPL_H__ +#define __UTIL_TYPES_VECTOR3_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +template<typename T> +ccl_always_inline vector3<T>::vector3() +{ +} + +template<typename T> +ccl_always_inline vector3<T>::vector3(const T& a) + : x(a), y(a), z(a) +{ +} + +template<typename T> +ccl_always_inline vector3<T>::vector3(const T& x, const T& y, const T& z) + : x(x), y(y), z(z) +{ +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_VECTOR3_IMPL_H__ */ |