diff options
Diffstat (limited to 'intern')
286 files changed, 3906 insertions, 1418 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index 0147a4306f4..873bbfa36fa 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -3,7 +3,14 @@ if(NOT WITH_BLENDER AND WITH_CYCLES_STANDALONE) set(CYCLES_INSTALL_PATH "") else() set(WITH_CYCLES_BLENDER ON) - set(CYCLES_INSTALL_PATH "scripts/addons/cycles") + # WINDOWS_PYTHON_DEBUG needs to write into the user addons folder since it will + # be started with --env-system-scripts pointing to the release folder, which will + # lack the cycles addon, and we don't want to write into it. + if(NOT WINDOWS_PYTHON_DEBUG) + set(CYCLES_INSTALL_PATH "scripts/addons/cycles") + else() + set(CYCLES_INSTALL_PATH "$ENV{appdata}/blender foundation/blender/${BLENDER_VERSION}/scripts/addons/cycles") + endif() endif() # External Libraries @@ -210,6 +217,15 @@ if(WITH_CYCLES_OSL) ) endif() +if(WITH_CYCLES_EMBREE) + add_definitions(-DWITH_EMBREE) + add_definitions(-DEMBREE_STATIC_LIB) + include_directories( + SYSTEM + ${EMBREE_INCLUDE_DIRS} + ) +endif() + if(WITH_CYCLES_OPENSUBDIV) add_definitions(-DWITH_OPENSUBDIV) include_directories( @@ -283,12 +299,19 @@ if(WITH_CYCLES_CUDA_BINARIES AND (NOT WITH_CYCLES_CUBIN_COMPILER)) set(MAX_MSVC 1910) elseif(${CUDA_VERSION} EQUAL "9.1") set(MAX_MSVC 1911) + elseif(${CUDA_VERSION} EQUAL "10.0") + set(MAX_MSVC 1999) endif() if(NOT MSVC_VERSION LESS ${MAX_MSVC} OR CMAKE_C_COMPILER_ID MATCHES "Clang") message(STATUS "nvcc not supported for this compiler version, using cycles_cubin_cc instead.") set(WITH_CYCLES_CUBIN_COMPILER ON) endif() unset(MAX_MSVC) + elseif(APPLE) + if(${XCODE_VERSION} VERSION_GREATER_EQUAL 10.0) + message(STATUS "nvcc not supported for this compiler version, using cycles_cubin_cc instead.") + set(WITH_CYCLES_CUBIN_COMPILER ON) + endif() endif() endif() diff --git a/intern/cycles/app/CMakeLists.txt b/intern/cycles/app/CMakeLists.txt index 4fd551b33c2..2c1367a86dc 100644 --- a/intern/cycles/app/CMakeLists.txt +++ 
b/intern/cycles/app/CMakeLists.txt @@ -77,6 +77,9 @@ macro(cycles_target_link_libraries target) if(WITH_CYCLES_OSL) target_link_libraries(${target} ${OSL_LIBRARIES} ${LLVM_LIBRARIES}) endif() + if(WITH_CYCLES_EMBREE) + target_link_libraries(${target} ${EMBREE_LIBRARIES}) + endif() if(WITH_CYCLES_OPENSUBDIV) target_link_libraries(${target} ${OPENSUBDIV_LIBRARIES}) endif() @@ -144,6 +147,9 @@ if(WITH_CYCLES_CUBIN_COMPILER) target_link_libraries(cycles_cubin_cc extern_cuew ${OPENIMAGEIO_LIBRARIES} + ${OPENEXR_LIBRARIES} + ${PUGIXML_LIBRARIES} + ${BOOST_LIBRARIES} ${PLATFORM_LINKLIBS} ) if(NOT CYCLES_STANDALONE_REPOSITORY) diff --git a/intern/cycles/app/cycles_xml.h b/intern/cycles/app/cycles_xml.h index 6a48980d8ea..a7bc1895d4e 100644 --- a/intern/cycles/app/cycles_xml.h +++ b/intern/cycles/app/cycles_xml.h @@ -29,4 +29,4 @@ void xml_read_file(Scene *scene, const char *filepath); CCL_NAMESPACE_END -#endif /* __CYCLES_XML_H__ */ +#endif /* __CYCLES_XML_H__ */ diff --git a/intern/cycles/blender/CCL_api.h b/intern/cycles/blender/CCL_api.h index 233ffc8802c..b9750ad0c53 100644 --- a/intern/cycles/blender/CCL_api.h +++ b/intern/cycles/blender/CCL_api.h @@ -33,4 +33,4 @@ void CCL_logging_verbosity_set(int verbosity); } #endif -#endif /* __CCL_API_H__ */ +#endif /* __CCL_API_H__ */ diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index 87dcbe486c7..23239ee4352 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -254,21 +254,32 @@ def register_passes(engine, scene, srl): if crl.use_pass_volume_indirect: engine.register_pass(scene, srl, "VolumeInd", 3, "RGB", 'COLOR') cscene = scene.cycles - if crl.use_denoising and crl.denoising_store_passes and not cscene.use_progressive_refine: - engine.register_pass(scene, srl, "Denoising Normal", 3, "XYZ", 'VECTOR') - engine.register_pass(scene, srl, "Denoising Normal Variance", 3, "XYZ", 'VECTOR') - engine.register_pass(scene, srl, "Denoising Albedo", 
3, "RGB", 'COLOR') - engine.register_pass(scene, srl, "Denoising Albedo Variance", 3, "RGB", 'COLOR') - engine.register_pass(scene, srl, "Denoising Depth", 1, "Z", 'VALUE') - engine.register_pass(scene, srl, "Denoising Depth Variance", 1, "Z", 'VALUE') - engine.register_pass(scene, srl, "Denoising Shadow A", 3, "XYV", 'VECTOR') - engine.register_pass(scene, srl, "Denoising Shadow B", 3, "XYV", 'VECTOR') - engine.register_pass(scene, srl, "Denoising Image", 3, "RGB", 'COLOR') - engine.register_pass(scene, srl, "Denoising Image Variance", 3, "RGB", 'COLOR') - - clean_options = ("denoising_diffuse_direct", "denoising_diffuse_indirect", - "denoising_glossy_direct", "denoising_glossy_indirect", - "denoising_transmission_direct", "denoising_transmission_indirect", - "denoising_subsurface_direct", "denoising_subsurface_indirect") - if any(getattr(crl, option) for option in clean_options): - engine.register_pass(scene, srl, "Denoising Clean", 3, "RGB", 'COLOR') + + if crl.use_pass_crypto_object: + for i in range(0, crl.pass_crypto_depth, 2): + engine.register_pass(scene, srl, "CryptoObject" + '{:02d}'.format(i), 4, "RGBA", 'COLOR') + if crl.use_pass_crypto_material: + for i in range(0, crl.pass_crypto_depth, 2): + engine.register_pass(scene, srl, "CryptoMaterial" + '{:02d}'.format(i), 4, "RGBA", 'COLOR') + if srl.cycles.use_pass_crypto_asset: + for i in range(0, srl.cycles.pass_crypto_depth, 2): + engine.register_pass(scene, srl, "CryptoAsset" + '{:02d}'.format(i), 4, "RGBA", 'COLOR') + + if crl.use_denoising or crl.denoising_store_passes: + engine.register_pass(scene, srl, "Noisy Image", 4, "RGBA", 'COLOR') + if crl.denoising_store_passes: + engine.register_pass(scene, srl, "Denoising Normal", 3, "XYZ", 'VECTOR') + engine.register_pass(scene, srl, "Denoising Normal Variance", 3, "XYZ", 'VECTOR') + engine.register_pass(scene, srl, "Denoising Albedo", 3, "RGB", 'COLOR') + engine.register_pass(scene, srl, "Denoising Albedo Variance", 3, "RGB", 'COLOR') + 
engine.register_pass(scene, srl, "Denoising Depth", 1, "Z", 'VALUE') + engine.register_pass(scene, srl, "Denoising Depth Variance", 1, "Z", 'VALUE') + engine.register_pass(scene, srl, "Denoising Shadow A", 3, "XYV", 'VECTOR') + engine.register_pass(scene, srl, "Denoising Shadow B", 3, "XYV", 'VECTOR') + engine.register_pass(scene, srl, "Denoising Image Variance", 3, "RGB", 'COLOR') + clean_options = ("denoising_diffuse_direct", "denoising_diffuse_indirect", + "denoising_glossy_direct", "denoising_glossy_indirect", + "denoising_transmission_direct", "denoising_transmission_indirect", + "denoising_subsurface_direct", "denoising_subsurface_indirect") + if any(getattr(crl, option) for option in clean_options): + engine.register_pass(scene, srl, "Denoising Clean", 3, "RGB", 'COLOR') diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 80b83c94012..d986ba8c7a8 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -547,6 +547,11 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): description="Use special type BVH optimized for hair (uses more ram but renders faster)", default=True, ) + cls.use_bvh_embree = BoolProperty( + name="Use Embree", + description="Use Embree as ray accelerator", + default=False, + ) cls.debug_bvh_time_steps = IntProperty( name="BVH Time Steps", description="Split BVH primitives by this number of time steps to speed up render time in cost of memory", @@ -1339,7 +1344,36 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup): default=False, update=update_render_passes, ) - + cls.use_pass_crypto_object = BoolProperty( + name="Cryptomatte Object", + description="Render cryptomatte object pass, for isolating objects in compositing", + default=False, + update=update_render_passes, + ) + cls.use_pass_crypto_material = BoolProperty( + name="Cryptomatte Material", + description="Render cryptomatte material pass, for isolating materials in 
compositing", + default=False, + update=update_render_passes, + ) + cls.use_pass_crypto_asset = BoolProperty( + name="Cryptomatte Asset", + description="Render cryptomatte asset pass, for isolating groups of objects with the same parent", + default=False, + update=update_render_passes, + ) + cls.pass_crypto_depth = IntProperty( + name="Cryptomatte Levels", + description="Sets how many unique objects can be distinguished per pixel", + default=6, min=2, max=16, step=2, + update=update_render_passes, + ) + cls.pass_crypto_accurate = BoolProperty( + name="Cryptomatte Accurate", + description="Gerenate a more accurate Cryptomatte pass. CPU only, may render slower and use more memory", + default=True, + update=update_render_passes, + ) @classmethod def unregister(cls): del bpy.types.SceneRenderLayer.cycles diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 5edbcb19672..2f1adfe4178 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -17,6 +17,7 @@ # <pep8 compliant> import bpy +import _cycles from bpy.types import ( Panel, @@ -430,11 +431,18 @@ class CYCLES_RENDER_PT_performance(CyclesButtonsPanel, Panel): col.separator() col.label(text="Acceleration structure:") + if _cycles.with_embree: + row = col.row() + row.active = use_cpu(context) + row.prop(cscene, "use_bvh_embree") + row = col.row() col.prop(cscene, "debug_use_spatial_splits") - col.prop(cscene, "debug_use_hair_bvh") + row = col.row() + row.active = not cscene.use_bvh_embree or not _cycles.with_embree + row.prop(cscene, "debug_use_hair_bvh") row = col.row() - row.active = not cscene.debug_use_spatial_splits + row.active = not cscene.debug_use_spatial_splits and not cscene.use_bvh_embree row.prop(cscene, "debug_bvh_time_steps") col = layout.column() @@ -491,8 +499,6 @@ class CYCLES_RENDER_PT_layer_passes(CyclesButtonsPanel, Panel): bl_options = {'DEFAULT_CLOSED'} def draw(self, context): - import _cycles - layout = self.layout scene = 
context.scene @@ -517,6 +523,8 @@ class CYCLES_RENDER_PT_layer_passes(CyclesButtonsPanel, Panel): col.prop(rl, "use_pass_shadow") col.prop(rl, "use_pass_ambient_occlusion") col.separator() + col.prop(crl, "denoising_store_passes", text="Denoising Data") + col.separator() col.prop(rl, "pass_alpha_threshold") col = split.column() @@ -549,12 +557,6 @@ class CYCLES_RENDER_PT_layer_passes(CyclesButtonsPanel, Panel): col.prop(rl, "use_pass_emit", text="Emission") col.prop(rl, "use_pass_environment") - if context.scene.cycles.feature_set == 'EXPERIMENTAL': - col.separator() - sub = col.column() - sub.active = crl.use_denoising - sub.prop(crl, "denoising_store_passes", text="Denoising") - col = layout.column() col.prop(crl, "pass_debug_render_time") if _cycles.with_cycles_debug: @@ -563,6 +565,17 @@ class CYCLES_RENDER_PT_layer_passes(CyclesButtonsPanel, Panel): col.prop(crl, "pass_debug_bvh_intersections") col.prop(crl, "pass_debug_ray_bounces") + crl = rl.cycles + layout.label("Cryptomatte:") + row = layout.row(align=True) + row.prop(crl, "use_pass_crypto_object", text="Object", toggle=True) + row.prop(crl, "use_pass_crypto_material", text="Material", toggle=True) + row.prop(crl, "use_pass_crypto_asset", text="Asset", toggle=True) + row = layout.row(align=True) + row.prop(crl, "pass_crypto_depth") + row = layout.row(align=True) + row.active = use_cpu(context) + row.prop(crl, "pass_crypto_accurate", text="Accurate Mode") class CYCLES_RENDER_PT_views(CyclesButtonsPanel, Panel): bl_label = "Views" @@ -630,9 +643,8 @@ class CYCLES_RENDER_PT_denoising(CyclesButtonsPanel, Panel): rl = rd.layers.active crl = rl.cycles - layout.active = crl.use_denoising - split = layout.split() + split.active = crl.use_denoising col = split.column() sub = col.column(align=True) @@ -647,24 +659,28 @@ class CYCLES_RENDER_PT_denoising(CyclesButtonsPanel, Panel): layout.separator() row = layout.row() + row.active = crl.use_denoising or crl.denoising_store_passes row.label(text="Diffuse:") sub = 
row.row(align=True) sub.prop(crl, "denoising_diffuse_direct", text="Direct", toggle=True) sub.prop(crl, "denoising_diffuse_indirect", text="Indirect", toggle=True) row = layout.row() + row.active = crl.use_denoising or crl.denoising_store_passes row.label(text="Glossy:") sub = row.row(align=True) sub.prop(crl, "denoising_glossy_direct", text="Direct", toggle=True) sub.prop(crl, "denoising_glossy_indirect", text="Indirect", toggle=True) row = layout.row() + row.active = crl.use_denoising or crl.denoising_store_passes row.label(text="Transmission:") sub = row.row(align=True) sub.prop(crl, "denoising_transmission_direct", text="Direct", toggle=True) sub.prop(crl, "denoising_transmission_indirect", text="Indirect", toggle=True) row = layout.row() + row.active = crl.use_denoising or crl.denoising_store_passes row.label(text="Subsurface:") sub = row.row(align=True) sub.prop(crl, "denoising_subsurface_direct", text="Direct", toggle=True) diff --git a/intern/cycles/blender/blender_curves.cpp b/intern/cycles/blender/blender_curves.cpp index 99313866e9e..94d5dc5ea3d 100644 --- a/intern/cycles/blender/blender_curves.cpp +++ b/intern/cycles/blender/blender_curves.cpp @@ -707,7 +707,7 @@ static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int if(diff == 0) { for(int curvekey = CData->curve_firstkey[curve]; curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve]; curvekey++) { if(i < mesh->curve_keys.size()) { - mP[i] =CurveSegmentMotionCV(CData, sys, curve, curvekey); + mP[i] = CurveSegmentMotionCV(CData, sys, curve, curvekey); if(!have_motion) { /* unlike mesh coordinates, these tend to be slightly different * between frames due to particle transforms into/out of object @@ -718,7 +718,6 @@ static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int have_motion = true; } } - i++; } } diff --git a/intern/cycles/blender/blender_logging.cpp b/intern/cycles/blender/blender_logging.cpp index d0f82e37662..3fca4efd097 100644 --- 
a/intern/cycles/blender/blender_logging.cpp +++ b/intern/cycles/blender/blender_logging.cpp @@ -22,7 +22,7 @@ void CCL_init_logging(const char *argv0) ccl::util_logging_init(argv0); } -void CCL_start_debug_logging(void) +void CCL_start_debug_logging() { ccl::util_logging_start(); } diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp index 35bf7beda41..a05c982b367 100644 --- a/intern/cycles/blender/blender_object.cpp +++ b/intern/cycles/blender/blender_object.cpp @@ -384,6 +384,23 @@ Object *BlenderSync::sync_object(BL::Object& b_parent, object_updated = true; } + /* sync the asset name for Cryptomatte */ + BL::Object parent = b_ob.parent(); + ustring parent_name; + if(parent) { + while(parent.parent()) { + parent = parent.parent(); + } + parent_name = parent.name(); + } + else { + parent_name = b_ob.name(); + } + if(object->asset_name != parent_name) { + object->asset_name = parent_name; + object_updated = true; + } + /* object sync * transform comparison should not be needed, but duplis don't work perfect * in the depsgraph and may not signal changes, so this is a workaround */ @@ -404,8 +421,8 @@ Object *BlenderSync::sync_object(BL::Object& b_parent, if(scene->need_motion() == Scene::MOTION_BLUR) { motion_steps = object_motion_steps(b_parent, b_ob); + mesh->motion_steps = motion_steps; if(motion_steps && object_use_deform_motion(b_parent, b_ob)) { - mesh->motion_steps = motion_steps; mesh->use_motion_blur = true; } } diff --git a/intern/cycles/blender/blender_object_cull.h b/intern/cycles/blender/blender_object_cull.h index 2147877a860..6e2a22438ec 100644 --- a/intern/cycles/blender/blender_object_cull.h +++ b/intern/cycles/blender/blender_object_cull.h @@ -46,4 +46,4 @@ private: CCL_NAMESPACE_END -#endif /* __BLENDER_OBJECT_CULL_H__ */ +#endif /* __BLENDER_OBJECT_CULL_H__ */ diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp index 4b01eb5f2d4..8b3bec56d1f 100644 --- 
a/intern/cycles/blender/blender_python.cpp +++ b/intern/cycles/blender/blender_python.cpp @@ -839,10 +839,18 @@ void *CCL_python_module_init() #ifdef WITH_NETWORK PyModule_AddObject(mod, "with_network", Py_True); Py_INCREF(Py_True); -#else /* WITH_NETWORK */ +#else /* WITH_NETWORK */ PyModule_AddObject(mod, "with_network", Py_False); Py_INCREF(Py_False); -#endif /* WITH_NETWORK */ +#endif /* WITH_NETWORK */ + +#ifdef WITH_EMBREE + PyModule_AddObject(mod, "with_embree", Py_True); + Py_INCREF(Py_True); +#else /* WITH_EMBREE */ + PyModule_AddObject(mod, "with_embree", Py_False); + Py_INCREF(Py_False); +#endif /* WITH_EMBREE */ return (void*)mod; } diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp index a07131d04ae..75c7dcee05e 100644 --- a/intern/cycles/blender/blender_session.cpp +++ b/intern/cycles/blender/blender_session.cpp @@ -35,6 +35,7 @@ #include "util/util_function.h" #include "util/util_hash.h" #include "util/util_logging.h" +#include "util/util_murmurhash.h" #include "util/util_progress.h" #include "util/util_time.h" @@ -370,6 +371,17 @@ void BlenderSession::update_render_tile(RenderTile& rtile, bool highlight) do_write_update_render_tile(rtile, false, false); } +static void add_cryptomatte_layer(BL::RenderResult& b_rr, string name, string manifest) +{ + string identifier = string_printf("%08x", util_murmur_hash3(name.c_str(), name.length(), 0)); + string prefix = "cryptomatte/" + identifier.substr(0, 7) + "/"; + + render_add_metadata(b_rr, prefix+"name", name); + render_add_metadata(b_rr, prefix+"hash", "MurmurHash3_32"); + render_add_metadata(b_rr, prefix+"conversion", "uint32_to_float32"); + render_add_metadata(b_rr, prefix+"manifest", manifest); +} + void BlenderSession::render() { /* set callback to write out render results */ @@ -405,17 +417,19 @@ void BlenderSession::render() BL::RenderLayer b_rlay = *b_single_rlay; /* add passes */ - array<Pass> passes = sync->sync_render_passes(b_rlay, *b_layer_iter, 
session_params); + vector<Pass> passes = sync->sync_render_passes(b_rlay, *b_layer_iter, session_params); buffer_params.passes = passes; PointerRNA crl = RNA_pointer_get(&b_layer_iter->ptr, "cycles"); bool use_denoising = get_boolean(crl, "use_denoising"); + bool denoising_passes = use_denoising || get_boolean(crl, "denoising_store_passes"); session->tile_manager.schedule_denoising = use_denoising; - buffer_params.denoising_data_pass = use_denoising; + buffer_params.denoising_data_pass = denoising_passes; buffer_params.denoising_clean_pass = (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES); session->params.use_denoising = use_denoising; + session->params.denoising_passes = denoising_passes; session->params.denoising_radius = get_int(crl, "denoising_radius"); session->params.denoising_strength = get_float(crl, "denoising_strength"); session->params.denoising_feature_strength = get_float(crl, "denoising_feature_strength"); @@ -475,15 +489,28 @@ void BlenderSession::render() break; } + BL::RenderResult b_full_rr = b_engine.get_result(); if(is_single_layer) { - BL::RenderResult b_rr = b_engine.get_result(); string num_aa_samples = string_printf("%d", session->params.samples); - b_rr.stamp_data_add_field("Cycles Samples", num_aa_samples.c_str()); + render_add_metadata(b_full_rr, "Cycles Samples", num_aa_samples); /* TODO(sergey): Report whether we're doing resumable render * and also start/end sample if so. 
*/ } + if(scene->film->cryptomatte_passes & CRYPT_OBJECT) { + add_cryptomatte_layer(b_full_rr, b_rlay_name+".CryptoObject", + scene->object_manager->get_cryptomatte_objects(scene)); + } + if(scene->film->cryptomatte_passes & CRYPT_MATERIAL) { + add_cryptomatte_layer(b_full_rr, b_rlay_name+".CryptoMaterial", + scene->shader_manager->get_cryptomatte_materials(scene)); + } + if(scene->film->cryptomatte_passes & CRYPT_ASSET) { + add_cryptomatte_layer(b_full_rr, b_rlay_name+".CryptoAsset", + scene->object_manager->get_cryptomatte_assets(scene)); + } + /* free result without merging */ end_render_result(b_engine, b_rr, true, true, false); @@ -700,7 +727,7 @@ void BlenderSession::do_write_update_render_result(BL::RenderResult& b_rr, bool read = false; if(pass_type != PASS_NONE) { /* copy pixels */ - read = buffers->get_pass_rect(pass_type, exposure, sample, components, &pixels[0]); + read = buffers->get_pass_rect(pass_type, exposure, sample, components, &pixels[0], b_pass.name()); } else { int denoising_offset = BlenderSync::get_denoising_pass(b_pass); @@ -719,7 +746,7 @@ void BlenderSession::do_write_update_render_result(BL::RenderResult& b_rr, else { /* copy combined pass */ BL::RenderPass b_combined_pass(b_rlay.passes.find_by_name("Combined", b_rview_name.c_str())); - if(buffers->get_pass_rect(PASS_COMBINED, exposure, sample, 4, &pixels[0])) + if(buffers->get_pass_rect(PASS_COMBINED, exposure, sample, 4, &pixels[0], "Combined")) b_combined_pass.rect(&pixels[0]); } diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h index 08f5c873bef..b8a9096b354 100644 --- a/intern/cycles/blender/blender_session.h +++ b/intern/cycles/blender/blender_session.h @@ -173,4 +173,4 @@ protected: CCL_NAMESPACE_END -#endif /* __BLENDER_SESSION_H__ */ +#endif /* __BLENDER_SESSION_H__ */ diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp index 3eefb92f6af..e33a6c20a52 100644 --- 
a/intern/cycles/blender/blender_shader.cpp +++ b/intern/cycles/blender/blender_shader.cpp @@ -635,8 +635,8 @@ static ShaderNode *add_node(Scene *scene, } } #else - (void)b_data; - (void)b_ntree; + (void) b_data; + (void) b_ntree; #endif } else if(b_node.is_a(&RNA_ShaderNodeTexImage)) { diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index 5e47252e336..832847c179f 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -40,6 +40,8 @@ CCL_NAMESPACE_BEGIN +static const char *cryptomatte_prefix = "Crypto"; + /* Constructor */ BlenderSync::BlenderSync(BL::RenderEngine& b_engine, @@ -517,6 +519,9 @@ PassType BlenderSync::get_pass_type(BL::RenderPass& b_pass) MAP_PASS("Debug Ray Bounces", PASS_RAY_BOUNCES); #endif MAP_PASS("Debug Render Time", PASS_RENDER_TIME); + if(string_startswith(name, cryptomatte_prefix)) { + return PASS_CRYPTOMATTE; + } #undef MAP_PASS return PASS_NONE; @@ -525,6 +530,9 @@ PassType BlenderSync::get_pass_type(BL::RenderPass& b_pass) int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass) { string name = b_pass.name(); + + if(name == "Noisy Image") return DENOISING_PASS_COLOR; + if(name.substr(0, 10) != "Denoising ") { return -1; } @@ -539,7 +547,6 @@ int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass) MAP_PASS("Depth Variance", DENOISING_PASS_DEPTH_VAR); MAP_PASS("Shadow A", DENOISING_PASS_SHADOW_A); MAP_PASS("Shadow B", DENOISING_PASS_SHADOW_B); - MAP_PASS("Image", DENOISING_PASS_COLOR); MAP_PASS("Image Variance", DENOISING_PASS_COLOR_VAR); MAP_PASS("Clean", DENOISING_PASS_CLEAN); #undef MAP_PASS @@ -547,11 +554,11 @@ int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass) return -1; } -array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay, - BL::SceneRenderLayer& b_srlay, - const SessionParams &session_params) +vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay, + BL::SceneRenderLayer& b_srlay, + const 
SessionParams &session_params) { - array<Pass> passes; + vector<Pass> passes; Pass::add(PASS_COMBINED, passes); if(!session_params.device.advanced_shading) { @@ -571,22 +578,11 @@ array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay, Pass::add(pass_type, passes); } - scene->film->denoising_flags = 0; PointerRNA crp = RNA_pointer_get(&b_srlay.ptr, "cycles"); - if(get_boolean(crp, "denoising_store_passes") && - get_boolean(crp, "use_denoising")) - { - b_engine.add_pass("Denoising Normal", 3, "XYZ", b_srlay.name().c_str()); - b_engine.add_pass("Denoising Normal Variance", 3, "XYZ", b_srlay.name().c_str()); - b_engine.add_pass("Denoising Albedo", 3, "RGB", b_srlay.name().c_str()); - b_engine.add_pass("Denoising Albedo Variance", 3, "RGB", b_srlay.name().c_str()); - b_engine.add_pass("Denoising Depth", 1, "Z", b_srlay.name().c_str()); - b_engine.add_pass("Denoising Depth Variance", 1, "Z", b_srlay.name().c_str()); - b_engine.add_pass("Denoising Shadow A", 3, "XYV", b_srlay.name().c_str()); - b_engine.add_pass("Denoising Shadow B", 3, "XYV", b_srlay.name().c_str()); - b_engine.add_pass("Denoising Image", 3, "RGB", b_srlay.name().c_str()); - b_engine.add_pass("Denoising Image Variance", 3, "RGB", b_srlay.name().c_str()); - + bool use_denoising = get_boolean(crp, "use_denoising"); + bool store_denoising_passes = get_boolean(crp, "denoising_store_passes"); + scene->film->denoising_flags = 0; + if(use_denoising || store_denoising_passes) { #define MAP_OPTION(name, flag) if(!get_boolean(crp, name)) scene->film->denoising_flags |= flag; MAP_OPTION("denoising_diffuse_direct", DENOISING_CLEAN_DIFFUSE_DIR); MAP_OPTION("denoising_diffuse_indirect", DENOISING_CLEAN_DIFFUSE_IND); @@ -597,9 +593,22 @@ array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay, MAP_OPTION("denoising_subsurface_direct", DENOISING_CLEAN_SUBSURFACE_DIR); MAP_OPTION("denoising_subsurface_indirect", DENOISING_CLEAN_SUBSURFACE_IND); #undef MAP_OPTION + b_engine.add_pass("Noisy 
Image", 4, "RGBA", b_srlay.name().c_str()); + } + + if(store_denoising_passes) { + b_engine.add_pass("Denoising Normal", 3, "XYZ", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Normal Variance", 3, "XYZ", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Albedo", 3, "RGB", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Albedo Variance", 3, "RGB", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Depth", 1, "Z", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Depth Variance", 1, "Z", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Shadow A", 3, "XYV", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Shadow B", 3, "XYV", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Image Variance", 3, "RGB", b_srlay.name().c_str()); if(scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES) { - b_engine.add_pass("Denoising Clean", 3, "RGB", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Clean", 3, "RGB", b_srlay.name().c_str()); } } #ifdef __KERNEL_DEBUG__ @@ -633,6 +642,39 @@ array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay, Pass::add(PASS_VOLUME_INDIRECT, passes); } + /* Cryptomatte stores two ID/weight pairs per RGBA layer. + * User facing parameter is the number of pairs. 
*/ + int crypto_depth = min(16, get_int(crp, "pass_crypto_depth")) / 2; + scene->film->cryptomatte_depth = crypto_depth; + scene->film->cryptomatte_passes = CRYPT_NONE; + if(get_boolean(crp, "use_pass_crypto_object")) { + for(int i = 0; i < crypto_depth; ++i) { + string passname = cryptomatte_prefix + string_printf("Object%02d", i); + b_engine.add_pass(passname.c_str(), 4, "RGBA", b_srlay.name().c_str()); + Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str()); + } + scene->film->cryptomatte_passes = (CryptomatteType)(scene->film->cryptomatte_passes | CRYPT_OBJECT); + } + if(get_boolean(crp, "use_pass_crypto_material")) { + for(int i = 0; i < crypto_depth; ++i) { + string passname = cryptomatte_prefix + string_printf("Material%02d", i); + b_engine.add_pass(passname.c_str(), 4, "RGBA", b_srlay.name().c_str()); + Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str()); + } + scene->film->cryptomatte_passes = (CryptomatteType)(scene->film->cryptomatte_passes | CRYPT_MATERIAL); + } + if(get_boolean(crp, "use_pass_crypto_asset")) { + for(int i = 0; i < crypto_depth; ++i) { + string passname = cryptomatte_prefix + string_printf("Asset%02d", i); + b_engine.add_pass(passname.c_str(), 4, "RGBA", b_srlay.name().c_str()); + Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str()); + } + scene->film->cryptomatte_passes = (CryptomatteType)(scene->film->cryptomatte_passes | CRYPT_ASSET); + } + if(get_boolean(crp, "pass_crypto_accurate") && scene->film->cryptomatte_passes != CRYPT_NONE) { + scene->film->cryptomatte_passes = (CryptomatteType)(scene->film->cryptomatte_passes | CRYPT_ACCURATE); + } + return passes; } @@ -689,6 +731,9 @@ SceneParams BlenderSync::get_scene_params(BL::Scene& b_scene, params.bvh_layout = DebugFlags().cpu.bvh_layout; } +#ifdef WITH_EMBREE + params.bvh_layout = RNA_boolean_get(&cscene, "use_bvh_embree") ? 
BVH_LAYOUT_EMBREE : params.bvh_layout; +#endif return params; } diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h index 5e63f76033d..6d78f62c7d0 100644 --- a/intern/cycles/blender/blender_sync.h +++ b/intern/cycles/blender/blender_sync.h @@ -66,9 +66,9 @@ public: void **python_thread_state, const char *layer = 0); void sync_render_layers(BL::SpaceView3D& b_v3d, const char *layer); - array<Pass> sync_render_passes(BL::RenderLayer& b_rlay, - BL::SceneRenderLayer& b_srlay, - const SessionParams &session_params); + vector<Pass> sync_render_passes(BL::RenderLayer& b_rlay, + BL::SceneRenderLayer& b_srlay, + const SessionParams &session_params); void sync_integrator(); void sync_camera(BL::RenderSettings& b_render, BL::Object& b_override, @@ -213,4 +213,4 @@ private: CCL_NAMESPACE_END -#endif /* __BLENDER_SYNC_H__ */ +#endif /* __BLENDER_SYNC_H__ */ diff --git a/intern/cycles/blender/blender_util.h b/intern/cycles/blender/blender_util.h index 7e61888348b..eb7019f45bc 100644 --- a/intern/cycles/blender/blender_util.h +++ b/intern/cycles/blender/blender_util.h @@ -20,6 +20,7 @@ #include "render/mesh.h" #include "util/util_algorithm.h" +#include "util/util_array.h" #include "util/util_map.h" #include "util/util_path.h" #include "util/util_set.h" @@ -243,6 +244,12 @@ static inline float *image_get_float_pixels_for_frame(BL::Image& image, return BKE_image_get_float_pixels_for_frame(image.ptr.data, frame); } +static inline void render_add_metadata(BL::RenderResult& b_rr, string name, string value) +{ + b_rr.stamp_data_add_field(name.c_str(), value.c_str()); +} + + /* Utilities */ static inline Transform get_transform(const BL::Array<float, 16>& array) @@ -832,4 +839,4 @@ protected: CCL_NAMESPACE_END -#endif /* __BLENDER_UTIL_H__ */ +#endif /* __BLENDER_UTIL_H__ */ diff --git a/intern/cycles/bvh/CMakeLists.txt b/intern/cycles/bvh/CMakeLists.txt index fcd28572fdf..6014624f395 100644 --- a/intern/cycles/bvh/CMakeLists.txt +++ 
b/intern/cycles/bvh/CMakeLists.txt @@ -13,6 +13,7 @@ set(SRC bvh8.cpp bvh_binning.cpp bvh_build.cpp + bvh_embree.cpp bvh_node.cpp bvh_sort.cpp bvh_split.cpp @@ -26,6 +27,7 @@ set(SRC_HEADERS bvh8.h bvh_binning.h bvh_build.h + bvh_embree.h bvh_node.h bvh_params.h bvh_sort.h diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp index bc73a3ad264..ac0614e3659 100644 --- a/intern/cycles/bvh/bvh.cpp +++ b/intern/cycles/bvh/bvh.cpp @@ -26,6 +26,10 @@ #include "bvh/bvh_build.h" #include "bvh/bvh_node.h" +#ifdef WITH_EMBREE +#include "bvh/bvh_embree.h" +#endif + #include "util/util_foreach.h" #include "util/util_logging.h" #include "util/util_progress.h" @@ -41,6 +45,7 @@ const char *bvh_layout_name(BVHLayout layout) case BVH_LAYOUT_BVH4: return "BVH4"; case BVH_LAYOUT_BVH8: return "BVH8"; case BVH_LAYOUT_NONE: return "NONE"; + case BVH_LAYOUT_EMBREE: return "EMBREE"; case BVH_LAYOUT_ALL: return "ALL"; } LOG(DFATAL) << "Unsupported BVH layout was passed."; @@ -96,6 +101,10 @@ BVH *BVH::create(const BVHParams& params, const vector<Object*>& objects) return new BVH4(params, objects); case BVH_LAYOUT_BVH8: return new BVH8(params, objects); + case BVH_LAYOUT_EMBREE: +#ifdef WITH_EMBREE + return new BVHEmbree(params, objects); +#endif case BVH_LAYOUT_NONE: case BVH_LAYOUT_ALL: break; @@ -106,7 +115,7 @@ BVH *BVH::create(const BVHParams& params, const vector<Object*>& objects) /* Building */ -void BVH::build(Progress& progress) +void BVH::build(Progress& progress, Stats*) { progress.set_substatus("Building BVH"); diff --git a/intern/cycles/bvh/bvh.h b/intern/cycles/bvh/bvh.h index 86be0bae4be..c8ad29004d7 100644 --- a/intern/cycles/bvh/bvh.h +++ b/intern/cycles/bvh/bvh.h @@ -19,12 +19,13 @@ #define __BVH_H__ #include "bvh/bvh_params.h" - +#include "util/util_array.h" #include "util/util_types.h" #include "util/util_vector.h" CCL_NAMESPACE_BEGIN +class Stats; class BVHNode; struct BVHStackEntry; class BVHParams; @@ -35,7 +36,6 @@ class Progress; #define BVH_ALIGN 
4096 #define TRI_NODE_SIZE 3 - /* Packed BVH * * BVH stored as it will be used for traversal on the rendering device. */ @@ -91,7 +91,7 @@ public: static BVH *create(const BVHParams& params, const vector<Object*>& objects); virtual ~BVH() {} - void build(Progress& progress); + virtual void build(Progress& progress, Stats *stats=NULL); void refit(Progress& progress); protected: @@ -126,4 +126,4 @@ struct BVHStackEntry CCL_NAMESPACE_END -#endif /* __BVH_H__ */ +#endif /* __BVH_H__ */ diff --git a/intern/cycles/bvh/bvh2.h b/intern/cycles/bvh/bvh2.h index df65ddca5b7..ecc697567bb 100644 --- a/intern/cycles/bvh/bvh2.h +++ b/intern/cycles/bvh/bvh2.h @@ -84,4 +84,4 @@ protected: CCL_NAMESPACE_END -#endif /* __BVH2_H__ */ +#endif /* __BVH2_H__ */ diff --git a/intern/cycles/bvh/bvh4.h b/intern/cycles/bvh/bvh4.h index 310909a37e1..28bab2fe327 100644 --- a/intern/cycles/bvh/bvh4.h +++ b/intern/cycles/bvh/bvh4.h @@ -84,4 +84,4 @@ protected: CCL_NAMESPACE_END -#endif /* __BVH4_H__ */ +#endif /* __BVH4_H__ */ diff --git a/intern/cycles/bvh/bvh8.cpp b/intern/cycles/bvh/bvh8.cpp index 70d003d938a..b95fe572e27 100644 --- a/intern/cycles/bvh/bvh8.cpp +++ b/intern/cycles/bvh/bvh8.cpp @@ -124,6 +124,7 @@ void BVH8::pack_aligned_node(int idx, data[0].a = __uint_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED); data[0].b = time_from; data[0].c = time_to; + for(int i = 0; i < num; i++) { float3 bb_min = bounds[i].min; float3 bb_max = bounds[i].max; @@ -140,8 +141,8 @@ void BVH8::pack_aligned_node(int idx, for(int i = num; i < 8; i++) { /* We store BB which would never be recorded as intersection - * so kernel might safely assume there are always 4 child nodes. - */ + * so kernel might safely assume there are always 4 child nodes. 
+ */ data[1][i] = FLT_MAX; data[2][i] = -FLT_MAX; @@ -153,6 +154,7 @@ void BVH8::pack_aligned_node(int idx, data[7][i] = __int_as_float(0); } + memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_ONODE_SIZE); } @@ -189,6 +191,7 @@ void BVH8::pack_unaligned_node(int idx, { float8 data[BVH_UNALIGNED_ONODE_SIZE]; memset(data, 0, sizeof(data)); + data[0].a = __uint_as_float(visibility | PATH_RAY_NODE_UNALIGNED); data[0].b = time_from; data[0].c = time_to; @@ -222,21 +225,21 @@ void BVH8::pack_unaligned_node(int idx, * so kernel might safely assume there are always 4 child nodes. */ - data[1][i] = 1.0f; - data[2][i] = 0.0f; - data[3][i] = 0.0f; + data[1][i] = NAN; + data[2][i] = NAN; + data[3][i] = NAN; - data[4][i] = 0.0f; - data[5][i] = 0.0f; - data[6][i] = 0.0f; + data[4][i] = NAN; + data[5][i] = NAN; + data[6][i] = NAN; - data[7][i] = 0.0f; - data[8][i] = 0.0f; - data[9][i] = 0.0f; + data[7][i] = NAN; + data[8][i] = NAN; + data[9][i] = NAN; - data[10][i] = -FLT_MAX; - data[11][i] = -FLT_MAX; - data[12][i] = -FLT_MAX; + data[10][i] = NAN; + data[11][i] = NAN; + data[12][i] = NAN; data[13][i] = __int_as_float(0); } diff --git a/intern/cycles/bvh/bvh8.h b/intern/cycles/bvh/bvh8.h index 274a2442c7e..834daf3abce 100644 --- a/intern/cycles/bvh/bvh8.h +++ b/intern/cycles/bvh/bvh8.h @@ -95,4 +95,4 @@ protected: CCL_NAMESPACE_END -#endif /* __BVH8_H__ */ +#endif /* __BVH8_H__ */ diff --git a/intern/cycles/bvh/bvh_build.h b/intern/cycles/bvh/bvh_build.h index 7b245139819..dd95a5cc0e8 100644 --- a/intern/cycles/bvh/bvh_build.h +++ b/intern/cycles/bvh/bvh_build.h @@ -23,6 +23,7 @@ #include "bvh/bvh_params.h" #include "bvh/bvh_unaligned.h" +#include "util/util_array.h" #include "util/util_task.h" #include "util/util_vector.h" @@ -142,4 +143,4 @@ protected: CCL_NAMESPACE_END -#endif /* __BVH_BUILD_H__ */ +#endif /* __BVH_BUILD_H__ */ diff --git a/intern/cycles/bvh/bvh_embree.cpp b/intern/cycles/bvh/bvh_embree.cpp new file mode 100644 index 00000000000..7489fe8ea42 --- /dev/null +++ 
b/intern/cycles/bvh/bvh_embree.cpp @@ -0,0 +1,884 @@ +/* + * Copyright 2018, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This class implements a ray accelerator for Cycles using Intel's Embree library. + * It supports triangles, curves, object and deformation blur and instancing. + * Not supported are thick line segments, those have no native equivalent in Embree. + * They could be implemented using Embree's thick curves, at the expense of wasted memory. + * User defined intersections for Embree could also be an option, but since Embree only uses aligned BVHs + * for user geometry, this would come with reduced performance and/or higher memory usage. + * + * Since Embree allows an object to be either curves or triangles but not both, Cycles object IDs are mapped + * to Embree IDs by multiplying by two and adding one for curves. + * + * This implementation shares RTCDevices between Cycles instances. Eventually each instance should get + * a separate RTCDevice to correctly keep track of memory usage. + * + * Vertex and index buffers are duplicated between Cycles device arrays and Embree. These could be merged, + * which would require changes to intersection refinement, shader setup, mesh light sampling and a few + * other places in Cycles where direct access to vertex data is required. 
+ */ + +#ifdef WITH_EMBREE + +#include <pmmintrin.h> +#include <xmmintrin.h> +#include <embree3/rtcore_geometry.h> + +#include "bvh/bvh_embree.h" + +/* Kernel includes are necessary so that the filter function for Embree can access the packed BVH. */ +#include "kernel/bvh/bvh_embree.h" +#include "kernel/kernel_compat_cpu.h" +#include "kernel/split/kernel_split_data_types.h" +#include "kernel/kernel_globals.h" +#include "kernel/kernel_random.h" + +#include "render/mesh.h" +#include "render/object.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_progress.h" + +CCL_NAMESPACE_BEGIN + +#define IS_HAIR(x) (x & 1) + +/* This gets called by Embree at every valid ray/object intersection. + * Things like recording subsurface or shadow hits for later evaluation + * as well as filtering for volume objects happen here. + * Cycles' own BVH does that directly inside the traversal calls. + */ +static void rtc_filter_func(const RTCFilterFunctionNArguments *args) +{ + /* Current implementation in Cycles assumes only single-ray intersection queries. */ + assert(args->N == 1); + + const RTCRay *ray = (RTCRay*)args->ray; + const RTCHit *hit = (RTCHit*)args->hit; + CCLIntersectContext *ctx = ((IntersectContext*)args->context)->userRayExt; + KernelGlobals *kg = ctx->kg; + + /* Check if there is backfacing hair to ignore. 
*/ + if(IS_HAIR(hit->geomID) && (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) + && !(kernel_data.curve.curveflags & CURVE_KN_BACKFACING) + && !(kernel_data.curve.curveflags & CURVE_KN_RIBBONS)) { + if(dot(make_float3(ray->dir_x, ray->dir_y, ray->dir_z), make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z)) > 0.0f) { + *args->valid = 0; + return; + } + } +} + +static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments* args) +{ + assert(args->N == 1); + + const RTCRay *ray = (RTCRay*)args->ray; + RTCHit *hit = (RTCHit*)args->hit; + CCLIntersectContext *ctx = ((IntersectContext*)args->context)->userRayExt; + KernelGlobals *kg = ctx->kg; + + /* For all ray types: Check if there is backfacing hair to ignore */ + if(IS_HAIR(hit->geomID) && (kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) + && !(kernel_data.curve.curveflags & CURVE_KN_BACKFACING) + && !(kernel_data.curve.curveflags & CURVE_KN_RIBBONS)) { + if(dot(make_float3(ray->dir_x, ray->dir_y, ray->dir_z), make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z)) > 0.0f) { + *args->valid = 0; + return; + } + } + + switch(ctx->type) { + case CCLIntersectContext::RAY_SHADOW_ALL: { + /* Append the intersection to the end of the array. */ + if(ctx->num_hits < ctx->max_hits) { + Intersection current_isect; + kernel_embree_convert_hit(kg, ray, hit, &current_isect); + for(size_t i = 0; i < ctx->max_hits; ++i) { + if(current_isect.object == ctx->isect_s[i].object && + current_isect.prim == ctx->isect_s[i].prim && + current_isect.t == ctx->isect_s[i].t) { + /* This intersection was already recorded, skip it. 
*/ + *args->valid = 0; + break; + } + } + Intersection *isect = &ctx->isect_s[ctx->num_hits]; + ++ctx->num_hits; + *isect = current_isect; + int prim = kernel_tex_fetch(__prim_index, isect->prim); + int shader = 0; + if(kernel_tex_fetch(__prim_type, isect->prim) & PRIMITIVE_ALL_TRIANGLE) { + shader = kernel_tex_fetch(__tri_shader, prim); + } + else { + float4 str = kernel_tex_fetch(__curves, prim); + shader = __float_as_int(str.z); + } + int flag = kernel_tex_fetch(__shaders, shader & SHADER_MASK).flags; + /* If no transparent shadows, all light is blocked. */ + if(flag & (SD_HAS_TRANSPARENT_SHADOW)) { + /* This tells Embree to continue tracing. */ + *args->valid = 0; + } + } + else { + /* Increase the number of hits beyond ray.max_hits + * so that the caller can detect this as opaque. */ + ++ctx->num_hits; + } + break; + } + case CCLIntersectContext::RAY_SSS: { + /* No intersection information requested, just return a hit. */ + if(ctx->max_hits == 0) { + break; + } + + /* See triangle_intersect_subsurface() for the native equivalent. */ + for(int i = min(ctx->max_hits, ctx->ss_isect->num_hits) - 1; i >= 0; --i) { + if(ctx->ss_isect->hits[i].t == ray->tfar) { + /* This tells Embree to continue tracing. */ + *args->valid = 0; + break; + } + } + + ++ctx->ss_isect->num_hits; + int hit_idx; + + if(ctx->ss_isect->num_hits <= ctx->max_hits) { + hit_idx = ctx->ss_isect->num_hits - 1; + } + else { + /* reservoir sampling: if we are at the maximum number of + * hits, randomly replace element or skip it */ + hit_idx = lcg_step_uint(ctx->lcg_state) % ctx->ss_isect->num_hits; + + if(hit_idx >= ctx->max_hits) { + /* This tells Embree to continue tracing. 
*/ + *args->valid = 0; + break; + } + } + /* record intersection */ + kernel_embree_convert_local_hit(kg, ray, hit, &ctx->ss_isect->hits[hit_idx], ctx->sss_object_id); + ctx->ss_isect->Ng[hit_idx].x = hit->Ng_x; + ctx->ss_isect->Ng[hit_idx].y = hit->Ng_y; + ctx->ss_isect->Ng[hit_idx].z = hit->Ng_z; + ctx->ss_isect->Ng[hit_idx] = normalize(ctx->ss_isect->Ng[hit_idx]); + /* This tells Embree to continue tracing. */ + *args->valid = 0; + break; + } + case CCLIntersectContext::RAY_VOLUME_ALL: { + /* Append the intersection to the end of the array. */ + if(ctx->num_hits < ctx->max_hits) { + Intersection current_isect; + kernel_embree_convert_hit(kg, ray, hit, &current_isect); + for(size_t i = 0; i < ctx->max_hits; ++i) { + if(current_isect.object == ctx->isect_s[i].object && + current_isect.prim == ctx->isect_s[i].prim && + current_isect.t == ctx->isect_s[i].t) { + /* This intersection was already recorded, skip it. */ + *args->valid = 0; + break; + } + } + Intersection *isect = &ctx->isect_s[ctx->num_hits]; + ++ctx->num_hits; + *isect = current_isect; + /* Only primitives from volume object. */ + uint tri_object = (isect->object == OBJECT_NONE) ? + kernel_tex_fetch(__prim_object, isect->prim) : isect->object; + int object_flag = kernel_tex_fetch(__object_flag, tri_object); + if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + --ctx->num_hits; + } + /* This tells Embree to continue tracing. */ + *args->valid = 0; + break; + } + } + case CCLIntersectContext::RAY_REGULAR: + default: + /* Nothing to do here. */ + break; + } +} + +static size_t unaccounted_mem = 0; + +static bool rtc_memory_monitor_func(void* userPtr, const ssize_t bytes, const bool) +{ + Stats *stats = (Stats*)userPtr; + if(stats) { + if(bytes > 0) { + stats->mem_alloc(bytes); + } + else { + stats->mem_free(-bytes); + } + } + else { + /* A stats pointer may not yet be available. Keep track of the memory usage for later. 
*/ + if(bytes >= 0) { + atomic_add_and_fetch_z(&unaccounted_mem, bytes); + } + else { + atomic_sub_and_fetch_z(&unaccounted_mem, -bytes); + } + } + return true; +} + +static void rtc_error_func(void*, enum RTCError, const char* str) +{ + VLOG(1) << str; +} + +static double progress_start_time = 0.0f; + +static bool rtc_progress_func(void* user_ptr, const double n) +{ + Progress *progress = (Progress*)user_ptr; + + if(time_dt() - progress_start_time < 0.25) { + return true; + } + + string msg = string_printf("Building BVH %.0f%%", n * 100.0); + progress->set_substatus(msg); + progress_start_time = time_dt(); + + return !progress->get_cancel(); +} + +/* This is to have a shared device between all BVH instances. + It would be useful to actually use a separate RTCDevice per Cycles instance. */ +RTCDevice BVHEmbree::rtc_shared_device = NULL; +int BVHEmbree::rtc_shared_users = 0; +thread_mutex BVHEmbree::rtc_shared_mutex; + +BVHEmbree::BVHEmbree(const BVHParams& params_, const vector<Object*>& objects_) +: BVH(params_, objects_), scene(NULL), mem_used(0), top_level(NULL), stats(NULL), + curve_subdivisions(params.curve_subdivisions), build_quality(RTC_BUILD_QUALITY_REFIT), + use_curves(params_.curve_flags & CURVE_KN_INTERPOLATE), + use_ribbons(params.curve_flags & CURVE_KN_RIBBONS), dynamic_scene(true) +{ + _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); + _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); + thread_scoped_lock lock(rtc_shared_mutex); + if(rtc_shared_users == 0) { + rtc_shared_device = rtcNewDevice("verbose=0"); + /* Check here if Embree was built with the correct flags. 
*/ + ssize_t ret = rtcGetDeviceProperty (rtc_shared_device,RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED); + if(ret != 1) { + assert(0); + VLOG(1) << "Embree is compiled without the RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED flag."\ + "Ray visiblity will not work."; + } + ret = rtcGetDeviceProperty (rtc_shared_device,RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED); + if(ret != 1) { + assert(0); + VLOG(1) << "Embree is compiled without the RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED flag."\ + "Renders may not look as expected."; + } + ret = rtcGetDeviceProperty (rtc_shared_device,RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED); + if(ret != 1) { + assert(0); + VLOG(1) << "Embree is compiled without the RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED flag. "\ + "Line primitives will not be rendered."; + } + ret = rtcGetDeviceProperty (rtc_shared_device,RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED); + if(ret != 1) { + assert(0); + VLOG(1) << "Embree is compiled without the RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED flag. "\ + "Triangle primitives will not be rendered."; + } + ret = rtcGetDeviceProperty (rtc_shared_device,RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED); + if(ret != 0) { + assert(0); + VLOG(1) << "Embree is compiled with the RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED flag. 
"\ + "Renders may not look as expected."; + } + } + ++rtc_shared_users; + + rtcSetDeviceErrorFunction(rtc_shared_device, rtc_error_func, NULL); + + pack.root_index = -1; +} + +BVHEmbree::~BVHEmbree() +{ + if(!params.top_level) { + destroy(scene); + } +} + +void BVHEmbree::destroy(RTCScene scene) +{ + if(scene) { + rtcReleaseScene(scene); + scene = NULL; + } + thread_scoped_lock lock(rtc_shared_mutex); + --rtc_shared_users; + if(rtc_shared_users == 0) { + rtcReleaseDevice (rtc_shared_device); + rtc_shared_device = NULL; + } +} + +void BVHEmbree::delete_rtcScene() +{ + if(scene) { + /* When this BVH is used as an instance in a top level BVH, don't delete now + * Let the top_level BVH know that it should delete it later. */ + if(top_level) { + top_level->add_delayed_delete_scene(scene); + } + else { + rtcReleaseScene(scene); + if(delayed_delete_scenes.size()) { + foreach(RTCScene s, delayed_delete_scenes) { + rtcReleaseScene(s); + } + } + delayed_delete_scenes.clear(); + } + scene = NULL; + } +} + +void BVHEmbree::build(Progress& progress, Stats *stats_) +{ + assert(rtc_shared_device); + stats = stats_; + rtcSetDeviceMemoryMonitorFunction(rtc_shared_device, rtc_memory_monitor_func, stats); + + progress.set_substatus("Building BVH"); + + if(scene) { + rtcReleaseScene(scene); + scene = NULL; + } + + const bool dynamic = params.bvh_type == SceneParams::BVH_DYNAMIC; + + scene = rtcNewScene(rtc_shared_device); + const RTCSceneFlags scene_flags = (dynamic ? RTC_SCENE_FLAG_DYNAMIC : RTC_SCENE_FLAG_NONE) | + RTC_SCENE_FLAG_COMPACT | RTC_SCENE_FLAG_ROBUST; + rtcSetSceneFlags(scene, scene_flags); + build_quality = dynamic ? RTC_BUILD_QUALITY_LOW : + (params.use_spatial_split ? 
RTC_BUILD_QUALITY_HIGH : RTC_BUILD_QUALITY_MEDIUM); + rtcSetSceneBuildQuality(scene, build_quality); + + int i = 0; + + pack.object_node.clear(); + + foreach(Object *ob, objects) { + if(params.top_level) { + if(!ob->is_traceable()) { + ++i; + continue; + } + if(!ob->mesh->is_instanced()) { + add_object(ob, i); + } + else { + add_instance(ob, i); + } + } + else { + add_object(ob, i); + } + ++i; + if(progress.get_cancel()) return; + } + + if(progress.get_cancel()) { + delete_rtcScene(); + stats = NULL; + return; + } + + rtcSetSceneProgressMonitorFunction(scene, rtc_progress_func, &progress); + rtcCommitScene(scene); + + pack_primitives(); + + if(progress.get_cancel()) { + delete_rtcScene(); + stats = NULL; + return; + } + + progress.set_substatus("Packing geometry"); + pack_nodes(NULL); + + stats = NULL; +} + +void BVHEmbree::add_object(Object *ob, int i) +{ + Mesh *mesh = ob->mesh; + if(params.primitive_mask & PRIMITIVE_ALL_TRIANGLE && mesh->num_triangles() > 0) { + add_triangles(ob, i); + } + if(params.primitive_mask & PRIMITIVE_ALL_CURVE && mesh->num_curves() > 0) { + add_curves(ob, i); + } +} + +void BVHEmbree::add_instance(Object *ob, int i) +{ + if(!ob || !ob->mesh) { + assert(0); + return; + } + BVHEmbree *instance_bvh = (BVHEmbree*)(ob->mesh->bvh); + + if(instance_bvh->top_level != this) { + instance_bvh->top_level = this; + } + + const size_t num_motion_steps = ob->use_motion() ? 
ob->motion.size() : 1; + RTCGeometry geom_id = rtcNewGeometry(rtc_shared_device, RTC_GEOMETRY_TYPE_INSTANCE); + rtcSetGeometryInstancedScene(geom_id, instance_bvh->scene); + rtcSetGeometryTimeStepCount(geom_id, num_motion_steps); + + if(ob->use_motion()) { + for(size_t step = 0; step < num_motion_steps; ++step) { + rtcSetGeometryTransform(geom_id, step, RTC_FORMAT_FLOAT3X4_ROW_MAJOR, (const float*)&ob->motion[step]); + } + } + else { + rtcSetGeometryTransform(geom_id, 0, RTC_FORMAT_FLOAT3X4_ROW_MAJOR, (const float*)&ob->tfm); + } + + pack.prim_index.push_back_slow(-1); + pack.prim_object.push_back_slow(i); + pack.prim_type.push_back_slow(PRIMITIVE_NONE); + pack.prim_tri_index.push_back_slow(-1); + + rtcSetGeometryUserData(geom_id, (void*) instance_bvh->scene); + rtcSetGeometryMask(geom_id, ob->visibility); + + rtcCommitGeometry(geom_id); + rtcAttachGeometryByID(scene, geom_id, i*2); + rtcReleaseGeometry(geom_id); +} + +void BVHEmbree::add_triangles(Object *ob, int i) +{ + size_t prim_offset = pack.prim_index.size(); + Mesh *mesh = ob->mesh; + const Attribute *attr_mP = NULL; + size_t num_motion_steps = 1; + if(mesh->has_motion_blur()) { + attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + if(attr_mP) { + num_motion_steps = mesh->motion_steps; + if(num_motion_steps > RTC_MAX_TIME_STEP_COUNT) { + assert(0); + num_motion_steps = RTC_MAX_TIME_STEP_COUNT; + } + } + } + + const size_t num_triangles = mesh->num_triangles(); + RTCGeometry geom_id = rtcNewGeometry(rtc_shared_device, RTC_GEOMETRY_TYPE_TRIANGLE); + rtcSetGeometryBuildQuality(geom_id, build_quality); + rtcSetGeometryTimeStepCount(geom_id, num_motion_steps); + + unsigned *rtc_indices = (unsigned*)rtcSetNewGeometryBuffer(geom_id, RTC_BUFFER_TYPE_INDEX, 0, + RTC_FORMAT_UINT3, sizeof (int) * 3, num_triangles); + assert(rtc_indices); + if(!rtc_indices) { + VLOG(1) << "Embree could not create new geometry buffer for mesh " << mesh->name.c_str() << ".\n"; + return; + } + for(size_t j = 0; j < 
num_triangles; ++j) { + Mesh::Triangle t = mesh->get_triangle(j); + rtc_indices[j*3] = t.v[0]; + rtc_indices[j*3+1] = t.v[1]; + rtc_indices[j*3+2] = t.v[2]; + } + + update_tri_vertex_buffer(geom_id, mesh); + + pack.prim_object.reserve(pack.prim_object.size() + num_triangles); + pack.prim_type.reserve(pack.prim_type.size() + num_triangles); + pack.prim_index.reserve(pack.prim_index.size() + num_triangles); + pack.prim_tri_index.reserve(pack.prim_index.size() + num_triangles); + for(size_t j = 0; j < num_triangles; ++j) { + pack.prim_object.push_back_reserved(i); + pack.prim_type.push_back_reserved(num_motion_steps > 1 ? PRIMITIVE_MOTION_TRIANGLE : PRIMITIVE_TRIANGLE); + pack.prim_index.push_back_reserved(j); + pack.prim_tri_index.push_back_reserved(j); + } + + rtcSetGeometryUserData(geom_id, (void*) prim_offset); + rtcSetGeometryIntersectFilterFunction(geom_id, rtc_filter_func); + rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func); + rtcSetGeometryMask(geom_id, ob->visibility); + + rtcCommitGeometry(geom_id); + rtcAttachGeometryByID(scene, geom_id, i*2); + rtcReleaseGeometry(geom_id); +} + +void BVHEmbree::update_tri_vertex_buffer(RTCGeometry geom_id, const Mesh* mesh) +{ + const Attribute *attr_mP = NULL; + size_t num_motion_steps = 1; + int t_mid = 0; + if(mesh->has_motion_blur()) { + attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + if(attr_mP) { + num_motion_steps = mesh->motion_steps; + t_mid = (num_motion_steps - 1) / 2; + if(num_motion_steps > RTC_MAX_TIME_STEP_COUNT) { + assert(0); + num_motion_steps = RTC_MAX_TIME_STEP_COUNT; + } + } + } + const size_t num_verts = mesh->verts.size(); + + for(int t = 0; t < num_motion_steps; ++t) { + const float3 *verts; + if(t == t_mid) { + verts = &mesh->verts[0]; + } + else { + int t_ = (t > t_mid) ? 
(t - 1) : t; + verts = &attr_mP->data_float3()[t_ * num_verts]; + } + + float *rtc_verts = (float*) rtcSetNewGeometryBuffer(geom_id, RTC_BUFFER_TYPE_VERTEX, t, + RTC_FORMAT_FLOAT3, sizeof(float) * 3, num_verts + 1); + assert(rtc_verts); + if(rtc_verts) { + for(size_t j = 0; j < num_verts; ++j) { + rtc_verts[0] = verts[j].x; + rtc_verts[1] = verts[j].y; + rtc_verts[2] = verts[j].z; + rtc_verts += 3; + } + } + } +} + +void BVHEmbree::update_curve_vertex_buffer(RTCGeometry geom_id, const Mesh* mesh) +{ + const Attribute *attr_mP = NULL; + size_t num_motion_steps = 1; + if(mesh->has_motion_blur()) { + attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + if(attr_mP) { + num_motion_steps = mesh->motion_steps; + } + } + + const size_t num_curves = mesh->num_curves(); + size_t num_keys = 0; + for(size_t j = 0; j < num_curves; ++j) { + const Mesh::Curve c = mesh->get_curve(j); + num_keys += c.num_keys; + } + + /* Copy the CV data to Embree */ + const int t_mid = (num_motion_steps - 1) / 2; + const float *curve_radius = &mesh->curve_radius[0]; + for(int t = 0; t < num_motion_steps; ++t) { + const float3 *verts; + if(t == t_mid || attr_mP == NULL) { + verts = &mesh->curve_keys[0]; + } + else { + int t_ = (t > t_mid) ? 
(t - 1) : t; + verts = &attr_mP->data_float3()[t_ * num_keys]; + } + + float4 *rtc_verts = (float4*)rtcSetNewGeometryBuffer(geom_id, RTC_BUFFER_TYPE_VERTEX, t, + RTC_FORMAT_FLOAT4, sizeof (float) * 4, num_keys); + float4 *rtc_tangents = NULL; + if(use_curves) { + rtc_tangents = (float4*)rtcSetNewGeometryBuffer(geom_id, RTC_BUFFER_TYPE_TANGENT, t, + RTC_FORMAT_FLOAT4, sizeof (float) * 4, num_keys); + assert(rtc_tangents); + } + assert(rtc_verts); + if(rtc_verts) { + if(use_curves && rtc_tangents) { + const size_t num_curves = mesh->num_curves(); + for(size_t j = 0; j < num_curves; ++j) { + Mesh::Curve c = mesh->get_curve(j); + int fk = c.first_key; + rtc_verts[0] = float3_to_float4(verts[fk]); + rtc_verts[0].w = curve_radius[fk]; + rtc_tangents[0] = float3_to_float4(verts[fk + 1] - verts[fk]); + rtc_tangents[0].w = curve_radius[fk + 1] - curve_radius[fk]; + ++fk; + int k = 1; + for(;k < c.num_segments(); ++k, ++fk) { + rtc_verts[k] = float3_to_float4(verts[fk]); + rtc_verts[k].w = curve_radius[fk]; + rtc_tangents[k] = float3_to_float4((verts[fk + 1] - verts[fk - 1]) * 0.5f); + rtc_tangents[k].w = (curve_radius[fk + 1] - curve_radius[fk - 1]) * 0.5f; + } + rtc_verts[k] = float3_to_float4(verts[fk]); + rtc_verts[k].w = curve_radius[fk]; + rtc_tangents[k] = float3_to_float4(verts[fk] - verts[fk - 1]); + rtc_tangents[k].w = curve_radius[fk] - curve_radius[fk - 1]; + rtc_verts += c.num_keys; + rtc_tangents += c.num_keys; + } + } + else { + for(size_t j = 0; j < num_keys; ++j) { + rtc_verts[j] = float3_to_float4(verts[j]); + rtc_verts[j].w = curve_radius[j]; + } + } + } + } +} + +void BVHEmbree::add_curves(Object *ob, int i) +{ + size_t prim_offset = pack.prim_index.size(); + const Mesh *mesh = ob->mesh; + const Attribute *attr_mP = NULL; + size_t num_motion_steps = 1; + if(mesh->has_motion_blur()) { + attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + if(attr_mP) { + num_motion_steps = mesh->motion_steps; + } + } + + const size_t num_curves = 
mesh->num_curves(); + size_t num_segments = 0; + for(size_t j = 0; j < num_curves; ++j) { + Mesh::Curve c = mesh->get_curve(j); + assert(c.num_segments() > 0); + num_segments += c.num_segments(); + } + + /* Make room for Cycles specific data. */ + pack.prim_object.reserve(pack.prim_object.size() + num_segments); + pack.prim_type.reserve(pack.prim_type.size() + num_segments); + pack.prim_index.reserve(pack.prim_index.size() + num_segments); + pack.prim_tri_index.reserve(pack.prim_index.size() + num_segments); + + enum RTCGeometryType type = (!use_curves) ? RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE : + (use_ribbons ? RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE : + RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE); + + RTCGeometry geom_id = rtcNewGeometry(rtc_shared_device, type); + rtcSetGeometryTessellationRate(geom_id, curve_subdivisions); + unsigned *rtc_indices = (unsigned*) rtcSetNewGeometryBuffer(geom_id, RTC_BUFFER_TYPE_INDEX, 0, + RTC_FORMAT_UINT, sizeof (int), num_segments); + size_t rtc_index = 0; + for(size_t j = 0; j < num_curves; ++j) { + Mesh::Curve c = mesh->get_curve(j); + for(size_t k = 0; k < c.num_segments(); ++k) { + rtc_indices[rtc_index] = c.first_key + k; + /* Cycles specific data. */ + pack.prim_object.push_back_reserved(i); + pack.prim_type.push_back_reserved(PRIMITIVE_PACK_SEGMENT(num_motion_steps > 1 ? 
+ PRIMITIVE_MOTION_CURVE : PRIMITIVE_CURVE, k)); + pack.prim_index.push_back_reserved(j); + pack.prim_tri_index.push_back_reserved(rtc_index); + + ++rtc_index; + } + } + + rtcSetGeometryBuildQuality(geom_id, build_quality); + rtcSetGeometryTimeStepCount(geom_id, num_motion_steps); + + update_curve_vertex_buffer(geom_id, mesh); + + rtcSetGeometryUserData(geom_id, (void*) prim_offset); + rtcSetGeometryIntersectFilterFunction(geom_id, rtc_filter_func); + rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func); + rtcSetGeometryMask(geom_id, ob->visibility); + + rtcCommitGeometry(geom_id); + rtcAttachGeometryByID(scene, geom_id, i * 2 + 1); + rtcReleaseGeometry(geom_id); +} + +void BVHEmbree::pack_nodes(const BVHNode *) +{ + /* Quite a bit of this code is for compatibility with Cycles' native BVH. */ + if(!params.top_level) { + return; + } + + for(size_t i = 0; i < pack.prim_index.size(); ++i) { + if(pack.prim_index[i] != -1) { + if(pack.prim_type[i] & PRIMITIVE_ALL_CURVE) + pack.prim_index[i] += objects[pack.prim_object[i]]->mesh->curve_offset; + else + pack.prim_index[i] += objects[pack.prim_object[i]]->mesh->tri_offset; + } + } + + size_t prim_offset = pack.prim_index.size(); + + /* reserve */ + size_t prim_index_size = pack.prim_index.size(); + size_t prim_tri_verts_size = pack.prim_tri_verts.size(); + + size_t pack_prim_index_offset = prim_index_size; + size_t pack_prim_tri_verts_offset = prim_tri_verts_size; + size_t object_offset = 0; + + map<Mesh*, int> mesh_map; + + foreach(Object *ob, objects) { + Mesh *mesh = ob->mesh; + BVH *bvh = mesh->bvh; + + if(mesh->need_build_bvh()) { + if(mesh_map.find(mesh) == mesh_map.end()) { + prim_index_size += bvh->pack.prim_index.size(); + prim_tri_verts_size += bvh->pack.prim_tri_verts.size(); + mesh_map[mesh] = 1; + } + } + } + + mesh_map.clear(); + + pack.prim_index.resize(prim_index_size); + pack.prim_type.resize(prim_index_size); + pack.prim_object.resize(prim_index_size); + pack.prim_visibility.clear(); + 
pack.prim_tri_verts.resize(prim_tri_verts_size); + pack.prim_tri_index.resize(prim_index_size); + pack.object_node.resize(objects.size()); + + int *pack_prim_index = (pack.prim_index.size())? &pack.prim_index[0]: NULL; + int *pack_prim_type = (pack.prim_type.size())? &pack.prim_type[0]: NULL; + int *pack_prim_object = (pack.prim_object.size())? &pack.prim_object[0]: NULL; + float4 *pack_prim_tri_verts = (pack.prim_tri_verts.size())? &pack.prim_tri_verts[0]: NULL; + uint *pack_prim_tri_index = (pack.prim_tri_index.size())? &pack.prim_tri_index[0]: NULL; + + /* merge */ + foreach(Object *ob, objects) { + Mesh *mesh = ob->mesh; + + /* We assume that if mesh doesn't need own BVH it was already included + * into a top-level BVH and no packing here is needed. + */ + if(!mesh->need_build_bvh()) { + pack.object_node[object_offset++] = prim_offset; + continue; + } + + /* if mesh already added once, don't add it again, but used set + * node offset for this object */ + map<Mesh*, int>::iterator it = mesh_map.find(mesh); + + if(mesh_map.find(mesh) != mesh_map.end()) { + int noffset = it->second; + pack.object_node[object_offset++] = noffset; + continue; + } + + BVHEmbree *bvh = (BVHEmbree*)mesh->bvh; + + rtc_memory_monitor_func(stats, unaccounted_mem, true); + unaccounted_mem = 0; + + int mesh_tri_offset = mesh->tri_offset; + int mesh_curve_offset = mesh->curve_offset; + + /* fill in node indexes for instances */ + pack.object_node[object_offset++] = prim_offset; + + mesh_map[mesh] = pack.object_node[object_offset-1]; + + /* merge primitive, object and triangle indexes */ + if(bvh->pack.prim_index.size()) { + size_t bvh_prim_index_size = bvh->pack.prim_index.size(); + int *bvh_prim_index = &bvh->pack.prim_index[0]; + int *bvh_prim_type = &bvh->pack.prim_type[0]; + uint *bvh_prim_tri_index = &bvh->pack.prim_tri_index[0]; + + for(size_t i = 0; i < bvh_prim_index_size; ++i) { + if(bvh->pack.prim_type[i] & PRIMITIVE_ALL_CURVE) { + pack_prim_index[pack_prim_index_offset] = 
bvh_prim_index[i] + mesh_curve_offset; + pack_prim_tri_index[pack_prim_index_offset] = -1; + } + else { + pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + mesh_tri_offset; + pack_prim_tri_index[pack_prim_index_offset] = + bvh_prim_tri_index[i] + pack_prim_tri_verts_offset; + } + + pack_prim_type[pack_prim_index_offset] = bvh_prim_type[i]; + pack_prim_object[pack_prim_index_offset] = 0; + + ++pack_prim_index_offset; + } + } + + /* Merge triangle vertices data. */ + if(bvh->pack.prim_tri_verts.size()) { + const size_t prim_tri_size = bvh->pack.prim_tri_verts.size(); + memcpy(pack_prim_tri_verts + pack_prim_tri_verts_offset, + &bvh->pack.prim_tri_verts[0], + prim_tri_size*sizeof(float4)); + pack_prim_tri_verts_offset += prim_tri_size; + } + + prim_offset += bvh->pack.prim_index.size(); + } +} + +void BVHEmbree::refit_nodes() +{ + /* Update all vertex buffers, then tell Embree to rebuild/-fit the BVHs. */ + unsigned geom_id = 0; + foreach(Object *ob, objects) { + if(!params.top_level || (ob->is_traceable() && !ob->mesh->is_instanced())) { + if(params.primitive_mask & PRIMITIVE_ALL_TRIANGLE && ob->mesh->num_triangles() > 0) { + update_tri_vertex_buffer(rtcGetGeometry(scene, geom_id), ob->mesh); + rtcCommitGeometry(rtcGetGeometry(scene,geom_id)); + } + + if(params.primitive_mask & PRIMITIVE_ALL_CURVE && ob->mesh->num_curves() > 0) { + update_curve_vertex_buffer(rtcGetGeometry(scene, geom_id+1), ob->mesh); + rtcCommitGeometry(rtcGetGeometry(scene,geom_id+1)); + } + } + geom_id += 2; + } + rtcCommitScene(scene); +} +CCL_NAMESPACE_END + +#endif /* WITH_EMBREE */ diff --git a/intern/cycles/bvh/bvh_embree.h b/intern/cycles/bvh/bvh_embree.h new file mode 100644 index 00000000000..9990826ba98 --- /dev/null +++ b/intern/cycles/bvh/bvh_embree.h @@ -0,0 +1,79 @@ +/* + * Copyright 2018, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __BVH_EMBREE_H__ +#define __BVH_EMBREE_H__ + +#ifdef WITH_EMBREE + +#include <embree3/rtcore.h> +#include <embree3/rtcore_scene.h> + +#include "bvh/bvh.h" +#include "bvh/bvh_params.h" + +#include "util/util_thread.h" +#include "util/util_types.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class Mesh; + /* BVH subclass that holds an Embree RTCScene handle; only declared when WITH_EMBREE is defined. */ +class BVHEmbree : public BVH +{ +public: + virtual void build(Progress& progress, Stats *stats) override; + virtual ~BVHEmbree(); + RTCScene scene; + static void destroy(RTCScene); +protected: + /* The constructor below is protected; BVH is a friend, which gives it access to that constructor. */ friend class BVH; + BVHEmbree(const BVHParams& params, const vector<Object*>& objects); + + virtual void pack_nodes(const BVHNode*) override; + virtual void refit_nodes() override; + + void add_object(Object *ob, int i); + void add_instance(Object *ob, int i); + void add_curves(Object *ob, int i); + void add_triangles(Object *ob, int i); + + ssize_t mem_used; + + /* Appends the scene handle to delayed_delete_scenes. NOTE(review): where that list is drained is not visible in this header. */ void add_delayed_delete_scene(RTCScene scene) { delayed_delete_scenes.push_back(scene); } + BVHEmbree *top_level; +private: + void delete_rtcScene(); + void update_tri_vertex_buffer(RTCGeometry geom_id, const Mesh* mesh); + void update_curve_vertex_buffer(RTCGeometry geom_id, const Mesh* mesh); + + /* Static members: one copy shared by every BVHEmbree instance. NOTE(review): rtc_shared_users looks like a user count guarded by rtc_shared_mutex - confirm in bvh_embree.cpp. */ static RTCDevice rtc_shared_device; + static int rtc_shared_users; + static thread_mutex rtc_shared_mutex; + + Stats *stats; + vector<RTCScene> delayed_delete_scenes; + int curve_subdivisions; + enum RTCBuildQuality build_quality; + bool use_curves, use_ribbons, dynamic_scene; +}; + +CCL_NAMESPACE_END + +#endif /* WITH_EMBREE */ + +#endif /* __BVH_EMBREE_H__ */ diff --git 
a/intern/cycles/bvh/bvh_node.h b/intern/cycles/bvh/bvh_node.h index ed89d52a50a..65d5df01158 100644 --- a/intern/cycles/bvh/bvh_node.h +++ b/intern/cycles/bvh/bvh_node.h @@ -169,4 +169,4 @@ public: CCL_NAMESPACE_END -#endif /* __BVH_NODE_H__ */ +#endif /* __BVH_NODE_H__ */ diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h index d8dd7df6ba1..6408d56da80 100644 --- a/intern/cycles/bvh/bvh_params.h +++ b/intern/cycles/bvh/bvh_params.h @@ -90,6 +90,13 @@ public: /* Same as above, but for triangle primitives. */ int num_motion_triangle_steps; + /* Same as in SceneParams. */ + int bvh_type; + + /* These are needed for Embree. */ + int curve_flags; + int curve_subdivisions; + /* fixed parameters */ enum { MAX_DEPTH = 64, @@ -123,6 +130,11 @@ public: num_motion_curve_steps = 0; num_motion_triangle_steps = 0; + + bvh_type = 0; + + curve_flags = 0; + curve_subdivisions = 4; } /* SAH costs */ @@ -274,4 +286,4 @@ struct BVHSpatialStorage { CCL_NAMESPACE_END -#endif /* __BVH_PARAMS_H__ */ +#endif /* __BVH_PARAMS_H__ */ diff --git a/intern/cycles/bvh/bvh_sort.h b/intern/cycles/bvh/bvh_sort.h index 936401d8607..6910cc1e9b4 100644 --- a/intern/cycles/bvh/bvh_sort.h +++ b/intern/cycles/bvh/bvh_sort.h @@ -35,4 +35,4 @@ void bvh_reference_sort(int start, CCL_NAMESPACE_END -#endif /* __BVH_SORT_H__ */ +#endif /* __BVH_SORT_H__ */ diff --git a/intern/cycles/bvh/bvh_split.h b/intern/cycles/bvh/bvh_split.h index a874a118b99..cb47deab211 100644 --- a/intern/cycles/bvh/bvh_split.h +++ b/intern/cycles/bvh/bvh_split.h @@ -259,4 +259,4 @@ public: CCL_NAMESPACE_END -#endif /* __BVH_SPLIT_H__ */ +#endif /* __BVH_SPLIT_H__ */ diff --git a/intern/cycles/bvh/bvh_unaligned.h b/intern/cycles/bvh/bvh_unaligned.h index c3ece051cd5..bcfb6ed68da 100644 --- a/intern/cycles/bvh/bvh_unaligned.h +++ b/intern/cycles/bvh/bvh_unaligned.h @@ -77,4 +77,4 @@ protected: CCL_NAMESPACE_END -#endif /* __BVH_UNALIGNED_H__ */ +#endif /* __BVH_UNALIGNED_H__ */ diff --git 
a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake index 2e386a6bfc5..d0f473a2939 100644 --- a/intern/cycles/cmake/external_libs.cmake +++ b/intern/cycles/cmake/external_libs.cmake @@ -133,6 +133,12 @@ if(CYCLES_STANDALONE_REPOSITORY) set(BOOST_DEFINITIONS "-DBOOST_ALL_NO_LIB") #### + # embree + if(WITH_CYCLES_EMBREE) + find_package(embree 3.2.4 REQUIRED) + endif() + + #### # Logging if(WITH_CYCLES_LOGGING) find_package(Glog REQUIRED) diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 6959dd73c32..7e20bb449c3 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -361,7 +361,6 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo>& subdevices, int th info.has_half_images = true; info.has_volume_decoupled = true; - info.bvh_layout_mask = BVH_LAYOUT_ALL; info.has_osl = true; foreach(const DeviceInfo &device, subdevices) { @@ -396,7 +395,6 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo>& subdevices, int th /* Accumulate device info. */ info.has_half_images &= device.has_half_images; info.has_volume_decoupled &= device.has_volume_decoupled; - info.bvh_layout_mask = device.bvh_layout_mask & info.bvh_layout_mask; info.has_osl &= device.has_osl; } diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index 2400788c833..f3fb338e638 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -58,7 +58,6 @@ public: bool advanced_shading; /* Supports full shading system. */ bool has_half_images; /* Support half-float textures. */ bool has_volume_decoupled; /* Decoupled volume shading. */ - BVHLayoutMask bvh_layout_mask; /* Bitmask of supported BVH layouts. */ bool has_osl; /* Support Open Shading Language. */ bool use_split_kernel; /* Use split or mega kernel. 
*/ int cpu_threads; @@ -74,7 +73,6 @@ public: advanced_shading = true; has_half_images = false; has_volume_decoupled = false; - bvh_layout_mask = BVH_LAYOUT_NONE; has_osl = false; use_split_kernel = false; } @@ -183,7 +181,7 @@ public: /* Convert the requested features structure to a build options, * which could then be passed to compilers. */ - string get_build_options(void) const + string get_build_options() const { string build_options = ""; if(experimental) { @@ -242,8 +240,8 @@ std::ostream& operator <<(std::ostream &os, /* Device */ struct DeviceDrawParams { - function<void(void)> bind_display_space_shader_cb; - function<void(void)> unbind_display_space_shader_cb; + function<void()> bind_display_space_shader_cb; + function<void()> unbind_display_space_shader_cb; }; class Device { @@ -281,6 +279,7 @@ public: fflush(stderr); } virtual bool show_samples() const { return false; } + virtual BVHLayoutMask get_bvh_layout_mask() const = 0; /* statistics */ Stats &stats; @@ -361,4 +360,4 @@ private: CCL_NAMESPACE_END -#endif /* __DEVICE_H__ */ +#endif /* __DEVICE_H__ */ diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 7c72ab1a009..76f6466bbde 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -41,6 +41,7 @@ #include "kernel/osl/osl_globals.h" #include "render/buffers.h" +#include "render/coverage.h" #include "util/util_debug.h" #include "util/util_foreach.h" @@ -80,11 +81,11 @@ public: /* Silence potential warnings about unused variables * when compiling without some architectures. 
*/ - (void)kernel_sse2; - (void)kernel_sse3; - (void)kernel_sse41; - (void)kernel_avx; - (void)kernel_avx2; + (void) kernel_sse2; + (void) kernel_sse3; + (void) kernel_sse41; + (void) kernel_avx; + (void) kernel_avx2; #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 if(DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) { architecture_name = "AVX2"; @@ -184,11 +185,11 @@ public: KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel; KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel; - KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel; - KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel; - KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel; - KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel; - KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel; + KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel; + KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel; + KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel; KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel; KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel; @@ -277,6 +278,20 @@ public: return (info.cpu_threads == 1); } + virtual BVHLayoutMask get_bvh_layout_mask() const { + 
BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2; + /* BVH2 is always in the mask; BVH4 is added only when DebugFlags().cpu.has_sse2() and system_cpu_support_sse2() both hold. */ if(DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) { + bvh_layout_mask |= BVH_LAYOUT_BVH4; + } + /* BVH8 is added under the equivalent pair of AVX2 checks. */ if(DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) { + bvh_layout_mask |= BVH_LAYOUT_BVH8; + } +#ifdef WITH_EMBREE + /* The Embree layout is added unconditionally whenever the build defines WITH_EMBREE. */ bvh_layout_mask |= BVH_LAYOUT_EMBREE; +#endif /* WITH_EMBREE */ + return bvh_layout_mask; + } + void load_texture_info() { if(need_texture_info) { @@ -499,6 +514,7 @@ public: filter_nlm_update_output_kernel()(dx, dy, blurDifference, (float*) image_ptr, + difference, (float*) out_ptr, weightAccum, local_rect, @@ -676,12 +692,22 @@ public: void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg) { + const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE; + scoped_timer timer(&tile.buffers->render_time); + Coverage coverage(kg, tile); + if(use_coverage) { + coverage.init_path_trace(); + } + float *render_buffer = (float*)tile.buffer; int start_sample = tile.start_sample; int end_sample = tile.start_sample + tile.num_samples; + _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); + _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); + for(int sample = start_sample; sample < end_sample; sample++) { if(task.get_cancel() || task_pool.canceled()) { if(task.need_finish_queue == false) @@ -690,6 +716,9 @@ public: for(int y = tile.y; y < tile.y + tile.h; y++) { for(int x = tile.x; x < tile.x + tile.w; x++) { + if(use_coverage) { + coverage.init_pixel(x, y); + } path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride); } @@ -699,6 +728,9 @@ public: task.update_progress(&tile, tile.w*tile.h); } + if(use_coverage) { + coverage.finalize(); + } } void denoise(DenoisingTask& denoising, RenderTile &tile) @@ -759,7 +791,6 @@ public: } else if(tile.task == RenderTile::DENOISE) { denoise(denoising, tile); - task.update_progress(&tile, tile.w*tile.h); } @@ -1027,13 +1058,6 @@ void device_cpu_info(vector<DeviceInfo>& devices) info.id = "CPU"; info.num = 0; 
info.advanced_shading = true; - info.bvh_layout_mask = BVH_LAYOUT_BVH2; - if(system_cpu_support_sse2()) { - info.bvh_layout_mask |= BVH_LAYOUT_BVH4; - } - if(system_cpu_support_avx2()) { - info.bvh_layout_mask |= BVH_LAYOUT_BVH8; - } info.has_volume_decoupled = true; info.has_osl = true; info.has_half_images = true; @@ -1041,7 +1065,7 @@ void device_cpu_info(vector<DeviceInfo>& devices) devices.insert(devices.begin(), info); } -string device_cpu_capabilities(void) +string device_cpu_capabilities() { string capabilities = ""; capabilities += system_cpu_support_sse2() ? "SSE2 " : ""; diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index da8e49f129f..46e7b043603 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -73,12 +73,12 @@ const char *cuewErrorString(CUresult result) return error.c_str(); } -const char *cuewCompilerPath(void) +const char *cuewCompilerPath() { /* Returns the value of the CYCLES_CUDA_NVCC_EXECUTABLE define. */ return CYCLES_CUDA_NVCC_EXECUTABLE; } -int cuewCompilerVersion(void) +int cuewCompilerVersion() { /* Folds CUDA_VERSION into a short number, e.g. 9010 gives 90 + 1 = 91. NOTE(review): assumes CUDA_VERSION is encoded as major*1000 + minor*10 - confirm against the CUDA headers. */ return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10); } @@ -181,6 +181,10 @@ public: return true; } + virtual BVHLayoutMask get_bvh_layout_mask() const { + /* This device reports the BVH2 layout only. */ return BVH_LAYOUT_BVH2; + } + /*#ifdef NDEBUG #define cuda_abort() #else @@ -207,7 +211,7 @@ public: /*cuda_abort();*/ \ cuda_error_documentation(); \ } \ - } (void)0 + } (void) 0 bool cuda_error_(CUresult result, const string& stmt) { @@ -1397,18 +1401,14 @@ public: int h = task->reconstruction_state.source_h; int stride = task->buffer.stride; - int shift_stride = stride*h; + int pass_stride = task->buffer.pass_stride; int num_shifts = (2*r+1)*(2*r+1); - int mem_size = sizeof(float)*shift_stride*num_shifts; - - device_only_memory<uchar> temporary_mem(this, "Denoising temporary_mem"); - temporary_mem.alloc_to_device(2*mem_size); if(have_error()) return false; - CUdeviceptr difference = cuda_device_ptr(temporary_mem.device_pointer); - CUdeviceptr blurDifference = 
difference + mem_size; + CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer); + CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts; { CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; @@ -1426,9 +1426,9 @@ public: task->reconstruction_state.source_w * task->reconstruction_state.source_h, num_shifts); - void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &shift_stride, &r, &task->buffer.pass_stride, &a, &k_2}; - void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &shift_stride, &r, &f}; - void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &shift_stride, &r, &f}; + void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &pass_stride, &a, &k_2}; + void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; + void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; void *construct_gramian_args[] = {&blurDifference, &task->buffer.mem.device_pointer, &task->storage.transform.device_pointer, @@ -1437,9 +1437,8 @@ public: &task->storage.XtWY.device_pointer, &task->reconstruction_state.filter_window, &w, &h, &stride, - &shift_stride, &r, - &f, - &task->buffer.pass_stride}; + &pass_stride, &r, + &f}; CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); @@ -1448,8 +1447,6 @@ public: CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); } - temporary_mem.free(); - { CUfunction cuFinalize; cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); @@ -1667,7 +1664,7 @@ public: for(int sample = start_sample; sample < end_sample; sample += step_samples) { /* Setup and copy work tile to device. 
*/ wtile->start_sample = sample; - wtile->num_samples = min(step_samples, end_sample - sample);; + wtile->num_samples = min(step_samples, end_sample - sample); work_tiles.copy_to_device(); CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer); @@ -2149,7 +2146,7 @@ public: /*cuda_abort();*/ \ device->cuda_error_documentation(); \ } \ - } (void)0 + } (void) 0 /* CUDA context scope. */ @@ -2358,7 +2355,7 @@ int2 CUDASplitKernel::split_kernel_global_size(device_memory& kg, device_memory& return global_size; } -bool device_cuda_init(void) +bool device_cuda_init() { #ifdef WITH_CUDA_DYNLOAD static bool initialized = false; @@ -2396,7 +2393,7 @@ bool device_cuda_init(void) return result; #else /* WITH_CUDA_DYNLOAD */ return true; -#endif /* WITH_CUDA_DYNLOAD */ +#endif /* WITH_CUDA_DYNLOAD */ } Device *device_cuda_create(DeviceInfo& info, Stats &stats, bool background) @@ -2466,7 +2463,6 @@ void device_cuda_info(vector<DeviceInfo>& devices) info.advanced_shading = (major >= 3); info.has_half_images = (major >= 3); info.has_volume_decoupled = false; - info.bvh_layout_mask = BVH_LAYOUT_BVH2; int pci_location[3] = {0, 0, 0}; cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num); @@ -2501,7 +2497,7 @@ void device_cuda_info(vector<DeviceInfo>& devices) devices.insert(devices.end(), display_devices.begin(), display_devices.end()); } -string device_cuda_capabilities(void) +string device_cuda_capabilities() { CUresult result = device_cuda_safe_init(); if(result != CUDA_SUCCESS) { @@ -2534,7 +2530,7 @@ string device_cuda_capabilities(void) capabilities += string_printf("\t\tCU_DEVICE_ATTRIBUTE_" #attr "\t\t\t%d\n", \ value); \ } \ - } (void)0 + } (void) 0 /* TODO(sergey): Strip all attributes which are not useful for us * or does not depend on the driver. 
*/ diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp index 23c18fa15b2..78c65a3d22d 100644 --- a/intern/cycles/device/device_denoising.cpp +++ b/intern/cycles/device/device_denoising.cpp @@ -99,14 +99,18 @@ void DenoisingTask::setup_denoising_buffer() buffer.mem.alloc_to_device(mem_size, false); /* CPUs process shifts sequentially while GPUs process them in parallel. */ - int num_shifts = 1; + int num_layers; if(buffer.gpu_temporary_mem) { /* Shadowing prefiltering uses a radius of 6, so allocate at least that much. */ int max_radius = max(radius, 6); - num_shifts = (2*max_radius + 1) * (2*max_radius + 1); + int num_shifts = (2*max_radius + 1) * (2*max_radius + 1); + num_layers = 2*num_shifts + 1; + } + else { + num_layers = 3; } /* Allocate two layers per shift as well as one for the weight accumulation. */ - buffer.temporary_mem.alloc_to_device((2*num_shifts + 1) * buffer.pass_stride); + buffer.temporary_mem.alloc_to_device(num_layers * buffer.pass_stride); } void DenoisingTask::prefilter_shadowing() diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h index 7474f71ff78..8e0666d0e59 100644 --- a/intern/cycles/device/device_denoising.h +++ b/intern/cycles/device/device_denoising.h @@ -166,4 +166,4 @@ protected: CCL_NAMESPACE_END -#endif /* __DEVICE_DENOISING_H__ */ +#endif /* __DEVICE_DENOISING_H__ */ diff --git a/intern/cycles/device/device_intern.h b/intern/cycles/device/device_intern.h index 941be448101..e6495c2bff3 100644 --- a/intern/cycles/device/device_intern.h +++ b/intern/cycles/device/device_intern.h @@ -22,9 +22,9 @@ CCL_NAMESPACE_BEGIN class Device; Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background); -bool device_opencl_init(void); +bool device_opencl_init(); Device *device_opencl_create(DeviceInfo& info, Stats &stats, bool background); -bool device_cuda_init(void); +bool device_cuda_init(); Device *device_cuda_create(DeviceInfo& info, 
Stats &stats, bool background); Device *device_network_create(DeviceInfo& info, Stats &stats, const char *address); Device *device_multi_create(DeviceInfo& info, Stats &stats, bool background); @@ -34,10 +34,10 @@ void device_opencl_info(vector<DeviceInfo>& devices); void device_cuda_info(vector<DeviceInfo>& devices); void device_network_info(vector<DeviceInfo>& devices); -string device_cpu_capabilities(void); -string device_opencl_capabilities(void); -string device_cuda_capabilities(void); +string device_cpu_capabilities(); +string device_opencl_capabilities(); +string device_cuda_capabilities(); CCL_NAMESPACE_END -#endif /* __DEVICE_INTERN_H__ */ +#endif /* __DEVICE_INTERN_H__ */ diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h index 2b4835c9c65..e43834bdc8d 100644 --- a/intern/cycles/device/device_memory.h +++ b/intern/cycles/device/device_memory.h @@ -21,6 +21,7 @@ * * Data types for allocating, copying and freeing device memory. */ +#include "util/util_array.h" #include "util/util_half.h" #include "util/util_texture.h" #include "util/util_types.h" @@ -496,4 +497,4 @@ protected: CCL_NAMESPACE_END -#endif /* __DEVICE_MEMORY_H__ */ +#endif /* __DEVICE_MEMORY_H__ */ diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp index f1bd3fd13e1..490ee3951c9 100644 --- a/intern/cycles/device/device_multi.cpp +++ b/intern/cycles/device/device_multi.cpp @@ -103,6 +103,14 @@ public: return devices.front().device->show_samples(); } + virtual BVHLayoutMask get_bvh_layout_mask() const { + /* Start from BVH_LAYOUT_ALL and AND in each sub-device's mask, so only layouts reported by every sub-device remain. With no sub-devices the result stays BVH_LAYOUT_ALL. */ BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL; + foreach(const SubDevice& sub_device, devices) { + bvh_layout_mask &= sub_device.device->get_bvh_layout_mask(); + } + return bvh_layout_mask; + } + bool load_kernels(const DeviceRequestedFeatures& requested_features) { foreach(SubDevice& sub, devices) diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp index 
204e405421d..b6e18621f12 100644 --- a/intern/cycles/device/device_network.cpp +++ b/intern/cycles/device/device_network.cpp @@ -87,6 +87,10 @@ public: snd.write(); } + virtual BVHLayoutMask get_bvh_layout_mask() const { + /* Always reports BVH2; the value is a constant and is not queried from the remote side. */ return BVH_LAYOUT_BVH2; + } + void mem_alloc(device_memory& mem) { if(mem.name) { @@ -306,7 +310,6 @@ void device_network_info(vector<DeviceInfo>& devices) /* todo: get this info from device */ info.advanced_shading = true; info.has_volume_decoupled = false; - info.bvh_layout_mask = BVH_LAYOUT_BVH2; info.has_osl = false; devices.push_back(info); diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h index 96e0de742db..67626ae177f 100644 --- a/intern/cycles/device/device_network.h +++ b/intern/cycles/device/device_network.h @@ -488,4 +488,4 @@ CCL_NAMESPACE_END #endif -#endif /* __DEVICE_NETWORK_H__ */ +#endif /* __DEVICE_NETWORK_H__ */ diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp index be0f8f45399..71410f80d57 100644 --- a/intern/cycles/device/device_opencl.cpp +++ b/intern/cycles/device/device_opencl.cpp @@ -44,7 +44,7 @@ Device *device_opencl_create(DeviceInfo& info, Stats &stats, bool background) } } -bool device_opencl_init(void) +bool device_opencl_init() { static bool initialized = false; static bool result = false; @@ -136,7 +136,6 @@ void device_opencl_info(vector<DeviceInfo>& devices) info.use_split_kernel = OpenCLInfo::kernel_use_split(platform_name, device_type); info.has_volume_decoupled = false; - info.bvh_layout_mask = BVH_LAYOUT_BVH2; info.id = id; /* Check OpenCL extensions */ @@ -147,7 +146,7 @@ void device_opencl_info(vector<DeviceInfo>& devices) } } -string device_opencl_capabilities(void) +string device_opencl_capabilities() { if(OpenCLInfo::device_type() == 0) { return "All OpenCL devices are forced to be OFF"; @@ -246,4 +245,4 @@ string device_opencl_capabilities(void) CCL_NAMESPACE_END -#endif /* WITH_OPENCL */ +#endif /* WITH_OPENCL */ diff 
--git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h index 26ddce5bb22..5af4367d1b6 100644 --- a/intern/cycles/device/device_split_kernel.h +++ b/intern/cycles/device/device_split_kernel.h @@ -130,4 +130,4 @@ public: CCL_NAMESPACE_END -#endif /* __DEVICE_SPLIT_KERNEL_H__ */ +#endif /* __DEVICE_SPLIT_KERNEL_H__ */ diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index ec87aa8c560..861014373b3 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -64,7 +64,7 @@ public: function<void(long, int)> update_progress_sample; function<void(RenderTile&)> update_tile_sample; function<void(RenderTile&)> release_tile; - function<bool(void)> get_cancel; + function<bool()> get_cancel; function<void(RenderTile*, Device*)> map_neighbor_tiles; function<void(RenderTile*, Device*)> unmap_neighbor_tiles; @@ -85,4 +85,4 @@ protected: CCL_NAMESPACE_END -#endif /* __DEVICE_TASK_H__ */ +#endif /* __DEVICE_TASK_H__ */ diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h index 6c73d10a376..8cb7f6d0b82 100644 --- a/intern/cycles/device/opencl/opencl.h +++ b/intern/cycles/device/opencl/opencl.h @@ -245,7 +245,7 @@ public: (device)->set_error(message); \ fprintf(stderr, "%s\n", message.c_str()); \ } \ - } (void)0 + } (void) 0 #define opencl_assert(stmt) \ { \ @@ -257,7 +257,7 @@ public: error_msg = message; \ fprintf(stderr, "%s\n", message.c_str()); \ } \ - } (void)0 + } (void) 0 class OpenCLDeviceBase : public Device { diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index cc887134bb0..1e73d37d7a4 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -761,7 +761,7 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, cl_mem variance_mem = CL_MEM_PTR(variance_ptr); cl_mem out_mem = CL_MEM_PTR(out_ptr); - 
mem_zero_kernel(*difference, sizeof(float)*pass_stride); + mem_zero_kernel(*weightAccum, sizeof(float)*pass_stride); mem_zero_kernel(out_ptr, sizeof(float)*pass_stride); cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); @@ -865,38 +865,38 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, int h = task->reconstruction_state.source_h; int stride = task->buffer.stride; - int shift_stride = stride*h; - int num_shifts = (2*task->radius + 1)*(2*task->radius + 1); - int mem_size = sizeof(float)*shift_stride*num_shifts; + int r = task->radius; + int pass_stride = task->buffer.pass_stride; + int num_shifts = (2*r+1)*(2*r+1); - cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr); - opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct"); - cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr); - opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct"); + device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts); + device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts); + cl_mem difference_mem = CL_MEM_PTR(*difference); + cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); kernel_set_args(ckNLMCalcDifference, 0, color_mem, color_variance_mem, - difference, + difference_mem, w, h, stride, - shift_stride, - task->radius, - task->buffer.pass_stride, + pass_stride, + r, + pass_stride, 1.0f, task->nlm_k_2); kernel_set_args(ckNLMBlur, 0, - difference, - blurDifference, + difference_mem, + blurDifference_mem, w, h, stride, - shift_stride, - task->radius, 4); + pass_stride, + r, 4); kernel_set_args(ckNLMCalcWeight, 0, - blurDifference, - difference, + blurDifference_mem, + difference_mem, w, h, stride, - shift_stride, - task->radius, 4); + pass_stride, + r, 4); kernel_set_args(ckNLMConstructGramian, 0, - blurDifference, + blurDifference_mem, buffer_mem, 
transform_mem, rank_mem, @@ -904,9 +904,8 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, XtWY_mem, task->reconstruction_state.filter_window, w, h, stride, - shift_stride, - task->radius, 4, - task->buffer.pass_stride); + pass_stride, + r, 4); enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); @@ -914,9 +913,6 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256); - opencl_assert(clReleaseMemObject(difference)); - opencl_assert(clReleaseMemObject(blurDifference)); - kernel_set_args(ckFinalize, 0, output_mem, rank_mem, diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp index e004c0b44f4..89001366d9d 100644 --- a/intern/cycles/device/opencl/opencl_mega.cpp +++ b/intern/cycles/device/opencl/opencl_mega.cpp @@ -43,6 +43,10 @@ public: return true; } + virtual BVHLayoutMask get_bvh_layout_mask() const { + /* The mega-kernel OpenCL device reports the BVH2 layout only. */ return BVH_LAYOUT_BVH2; + } + virtual bool load_kernels(const DeviceRequestedFeatures& /*requested_features*/, vector<OpenCLProgram*> &programs) { diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp index 66a4aa7e891..adb73bc6e2c 100644 --- a/intern/cycles/device/opencl/opencl_split.cpp +++ b/intern/cycles/device/opencl/opencl_split.cpp @@ -95,6 +95,10 @@ public: return true; } + virtual BVHLayoutMask get_bvh_layout_mask() const { + /* The split-kernel OpenCL device reports the BVH2 layout only. */ return BVH_LAYOUT_BVH2; + } + virtual bool load_kernels(const DeviceRequestedFeatures& requested_features, vector<OpenCLDeviceBase::OpenCLProgram*> &programs) { @@ -459,4 +463,4 @@ Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, bool backgrou CCL_NAMESPACE_END -#endif /* WITH_OPENCL */ +#endif /* WITH_OPENCL */ diff --git a/intern/cycles/device/opencl/opencl_util.cpp 
b/intern/cycles/device/opencl/opencl_util.cpp index 895e4149a3a..4c9f3cd6ef7 100644 --- a/intern/cycles/device/opencl/opencl_util.cpp +++ b/intern/cycles/device/opencl/opencl_util.cpp @@ -106,7 +106,7 @@ cl_context OpenCLCache::get_context(cl_platform_id platform, cl_int ciErr = clRetainContext(slot.context); assert(ciErr == CL_SUCCESS); - (void)ciErr; + (void) ciErr; return slot.context; } @@ -153,7 +153,7 @@ cl_program OpenCLCache::get_program(cl_platform_id platform, cl_int ciErr = clRetainProgram(entry.program); assert(ciErr == CL_SUCCESS); - (void)ciErr; + (void) ciErr; return entry.program; } @@ -188,7 +188,7 @@ void OpenCLCache::store_context(cl_platform_id platform, * The caller is going to release the object when done with it. */ cl_int ciErr = clRetainContext(context); assert(ciErr == CL_SUCCESS); - (void)ciErr; + (void) ciErr; } void OpenCLCache::store_program(cl_platform_id platform, @@ -227,7 +227,7 @@ void OpenCLCache::store_program(cl_platform_id platform, */ cl_int ciErr = clRetainProgram(program); assert(ciErr == CL_SUCCESS); - (void)ciErr; + (void) ciErr; } string OpenCLCache::get_kernel_md5() diff --git a/intern/cycles/graph/node.h b/intern/cycles/graph/node.h index 11695a8631d..d50a3786139 100644 --- a/intern/cycles/graph/node.h +++ b/intern/cycles/graph/node.h @@ -18,9 +18,9 @@ #include "graph/node_type.h" +#include "util/util_array.h" #include "util/util_map.h" #include "util/util_param.h" -#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/graph/node_type.h b/intern/cycles/graph/node_type.h index 1d565794b27..7d6abae2314 100644 --- a/intern/cycles/graph/node_type.h +++ b/intern/cycles/graph/node_type.h @@ -17,7 +17,7 @@ #pragma once #include "graph/node_enum.h" - +#include "util/util_array.h" #include "util/util_map.h" #include "util/util_param.h" #include "util/util_string.h" diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index c6e92c6d89d..92cb66bdec9 100644 --- 
a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -82,6 +82,7 @@ set(SRC_BVH_HEADERS bvh/obvh_traversal.h bvh/obvh_volume.h bvh/obvh_volume_all.h + bvh/bvh_embree.h ) set(SRC_HEADERS @@ -96,6 +97,7 @@ set(SRC_HEADERS kernel_emission.h kernel_film.h kernel_globals.h + kernel_id_passes.h kernel_jitter.h kernel_light.h kernel_math.h @@ -340,11 +342,11 @@ if(WITH_CYCLES_CUDA_BINARIES) set(CUDA_VERSION "${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}") # warn for other versions - if(CUDA_VERSION MATCHES "80" OR CUDA_VERSION MATCHES "90") + if(CUDA_VERSION MATCHES "90" OR CUDA_VERSION MATCHES "91") else() message(WARNING "CUDA version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, " - "build may succeed but only CUDA 8.0 is officially supported") + "build may succeed but only CUDA 9.0 and 9.1 are officially supported") endif() # build for each arch diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h index 2ad55d041bf..6708a3efac1 100644 --- a/intern/cycles/kernel/bvh/bvh.h +++ b/intern/cycles/kernel/bvh/bvh.h @@ -25,6 +25,10 @@ * the code has been extended and modified to support more primitives and work * with CPU/CUDA/OpenCL. */ +#ifdef __EMBREE__ +# include "kernel/bvh/bvh_embree.h" +#endif + CCL_NAMESPACE_BEGIN #include "kernel/bvh/bvh_types.h" @@ -32,9 +36,9 @@ CCL_NAMESPACE_BEGIN /* Common QBVH functions. */ #ifdef __QBVH__ # include "kernel/bvh/qbvh_nodes.h" -#ifdef __KERNEL_AVX2__ -# include "kernel/bvh/obvh_nodes.h" -#endif +# ifdef __KERNEL_AVX2__ +# include "kernel/bvh/obvh_nodes.h" +# endif #endif /* Regular BVH traversal */ @@ -160,6 +164,19 @@ CCL_NAMESPACE_BEGIN #undef BVH_NAME_EVAL #undef BVH_FUNCTION_FULL_NAME +ccl_device_inline bool scene_intersect_valid(const Ray *ray) +{ + /* NOTE: Due to some vectorization code non-finite origin point might + * cause lots of false-positive intersections which will overflow traversal + * stack. 
+ * This code is a quick way to perform early output, to avoid crashes in + * such cases. + * From production scenes so far it seems it's enough to test first element + * only. + */ + return isfinite(ray->P.x); +} + /* Note: ray is passed by value to work around a possible CUDA compiler bug. */ ccl_device_intersect bool scene_intersect(KernelGlobals *kg, const Ray ray, @@ -169,39 +186,57 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg, float difl, float extmax) { + if(!scene_intersect_valid(&ray)) { + return false; + } +#ifdef __EMBREE__ + if(kernel_data.bvh.scene) { + isect->t = ray.t; + CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_REGULAR); + IntersectContext rtc_ctx(&ctx); + RTCRayHit ray_hit; + kernel_embree_setup_rayhit(ray, ray_hit, visibility); + rtcIntersect1(kernel_data.bvh.scene, &rtc_ctx.context, &ray_hit); + if(ray_hit.hit.geomID != RTC_INVALID_GEOMETRY_ID && ray_hit.hit.primID != RTC_INVALID_GEOMETRY_ID) { + kernel_embree_convert_hit(kg, &ray_hit.ray, &ray_hit.hit, isect); + return true; + } + return false; + } +#endif /* __EMBREE__ */ #ifdef __OBJECT_MOTION__ if(kernel_data.bvh.have_motion) { # ifdef __HAIR__ if(kernel_data.bvh.have_curves) return bvh_intersect_hair_motion(kg, &ray, isect, visibility, lcg_state, difl, extmax); -# endif /* __HAIR__ */ +# endif /* __HAIR__ */ return bvh_intersect_motion(kg, &ray, isect, visibility); } -#endif /* __OBJECT_MOTION__ */ +#endif /* __OBJECT_MOTION__ */ #ifdef __HAIR__ if(kernel_data.bvh.have_curves) return bvh_intersect_hair(kg, &ray, isect, visibility, lcg_state, difl, extmax); -#endif /* __HAIR__ */ +#endif /* __HAIR__ */ #ifdef __KERNEL_CPU__ # ifdef __INSTANCING__ if(kernel_data.bvh.have_instancing) return bvh_intersect_instancing(kg, &ray, isect, visibility); -# endif /* __INSTANCING__ */ +# endif /* __INSTANCING__ */ return bvh_intersect(kg, &ray, isect, visibility); -#else /* __KERNEL_CPU__ */ +#else /* __KERNEL_CPU__ */ # ifdef __INSTANCING__ return bvh_intersect_instancing(kg, 
&ray, isect, visibility); # else return bvh_intersect(kg, &ray, isect, visibility); -# endif /* __INSTANCING__ */ +# endif /* __INSTANCING__ */ -#endif /* __KERNEL_CPU__ */ +#endif /* __KERNEL_CPU__ */ } #ifdef __BVH_LOCAL__ @@ -213,6 +248,58 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg, uint *lcg_state, int max_hits) { + if(!scene_intersect_valid(&ray)) { + return false; + } +#ifdef __EMBREE__ + if(kernel_data.bvh.scene) { + CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SSS); + ctx.lcg_state = lcg_state; + ctx.max_hits = max_hits; + ctx.ss_isect = local_isect; + local_isect->num_hits = 0; + ctx.sss_object_id = local_object; + IntersectContext rtc_ctx(&ctx); + RTCRay rtc_ray; + kernel_embree_setup_ray(ray, rtc_ray, PATH_RAY_ALL_VISIBILITY); + + /* Get the Embree scene for this intersection. */ + RTCGeometry geom = rtcGetGeometry(kernel_data.bvh.scene, local_object * 2); + if(geom) { + float3 P = ray.P; + float3 dir = ray.D; + float3 idir = ray.D; + const int object_flag = kernel_tex_fetch(__object_flag, local_object); + if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { + Transform ob_itfm; + rtc_ray.tfar = bvh_instance_motion_push(kg, + local_object, + &ray, + &P, + &dir, + &idir, + ray.t, + &ob_itfm); + /* bvh_instance_motion_push() returns the inverse transform but + * it's not needed here. 
*/ + (void) ob_itfm; + + rtc_ray.org_x = P.x; + rtc_ray.org_y = P.y; + rtc_ray.org_z = P.z; + rtc_ray.dir_x = dir.x; + rtc_ray.dir_y = dir.y; + rtc_ray.dir_z = dir.z; + } + RTCScene scene = (RTCScene)rtcGetGeometryUserData(geom); + if(scene) { + rtcOccluded1(scene, &rtc_ctx.context, &rtc_ray); + } + } + + return local_isect->num_hits > 0; + } +#endif /* __EMBREE__ */ #ifdef __OBJECT_MOTION__ if(kernel_data.bvh.have_motion) { return bvh_intersect_local_motion(kg, @@ -222,7 +309,7 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg, lcg_state, max_hits); } -#endif /* __OBJECT_MOTION__ */ +#endif /* __OBJECT_MOTION__ */ return bvh_intersect_local(kg, &ray, local_isect, @@ -240,6 +327,27 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, uint max_hits, uint *num_hits) { + if(!scene_intersect_valid(ray)) { + return false; + } +# ifdef __EMBREE__ + if(kernel_data.bvh.scene) { + CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL); + ctx.isect_s = isect; + ctx.max_hits = max_hits; + ctx.num_hits = 0; + IntersectContext rtc_ctx(&ctx); + RTCRay rtc_ray; + kernel_embree_setup_ray(*ray, rtc_ray, PATH_RAY_SHADOW); + rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray); + + if(ctx.num_hits > max_hits) { + return true; + } + *num_hits = ctx.num_hits; + return rtc_ray.tfar == -INFINITY; + } +# endif # ifdef __OBJECT_MOTION__ if(kernel_data.bvh.have_motion) { # ifdef __HAIR__ @@ -251,7 +359,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, max_hits, num_hits); } -# endif /* __HAIR__ */ +# endif /* __HAIR__ */ return bvh_intersect_shadow_all_motion(kg, ray, @@ -260,7 +368,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, max_hits, num_hits); } -# endif /* __OBJECT_MOTION__ */ +# endif /* __OBJECT_MOTION__ */ # ifdef __HAIR__ if(kernel_data.bvh.have_curves) { @@ -271,7 +379,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, max_hits, 
num_hits); } -# endif /* __HAIR__ */ +# endif /* __HAIR__ */ # ifdef __INSTANCING__ if(kernel_data.bvh.have_instancing) { @@ -282,7 +390,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, max_hits, num_hits); } -# endif /* __INSTANCING__ */ +# endif /* __INSTANCING__ */ return bvh_intersect_shadow_all(kg, ray, @@ -299,24 +407,27 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg, Intersection *isect, const uint visibility) { + if(!scene_intersect_valid(ray)) { + return false; + } # ifdef __OBJECT_MOTION__ if(kernel_data.bvh.have_motion) { return bvh_intersect_volume_motion(kg, ray, isect, visibility); } -# endif /* __OBJECT_MOTION__ */ +# endif /* __OBJECT_MOTION__ */ # ifdef __KERNEL_CPU__ # ifdef __INSTANCING__ if(kernel_data.bvh.have_instancing) return bvh_intersect_volume_instancing(kg, ray, isect, visibility); -# endif /* __INSTANCING__ */ +# endif /* __INSTANCING__ */ return bvh_intersect_volume(kg, ray, isect, visibility); -# else /* __KERNEL_CPU__ */ +# else /* __KERNEL_CPU__ */ # ifdef __INSTANCING__ return bvh_intersect_volume_instancing(kg, ray, isect, visibility); # else return bvh_intersect_volume(kg, ray, isect, visibility); -# endif /* __INSTANCING__ */ -# endif /* __KERNEL_CPU__ */ +# endif /* __INSTANCING__ */ +# endif /* __KERNEL_CPU__ */ } #endif /* __VOLUME__ */ @@ -327,15 +438,31 @@ ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg, const uint max_hits, const uint visibility) { + if(!scene_intersect_valid(ray)) { + return false; + } +# ifdef __EMBREE__ + if(kernel_data.bvh.scene) { + CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_VOLUME_ALL); + ctx.isect_s = isect; + ctx.max_hits = max_hits; + ctx.num_hits = 0; + IntersectContext rtc_ctx(&ctx); + RTCRay rtc_ray; + kernel_embree_setup_ray(*ray, rtc_ray, visibility); + rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray); + return rtc_ray.tfar == -INFINITY; + } +# endif # ifdef __OBJECT_MOTION__ 
if(kernel_data.bvh.have_motion) { return bvh_intersect_volume_all_motion(kg, ray, isect, max_hits, visibility); } -# endif /* __OBJECT_MOTION__ */ +# endif /* __OBJECT_MOTION__ */ # ifdef __INSTANCING__ if(kernel_data.bvh.have_instancing) return bvh_intersect_volume_all_instancing(kg, ray, isect, max_hits, visibility); -# endif /* __INSTANCING__ */ +# endif /* __INSTANCING__ */ return bvh_intersect_volume_all(kg, ray, isect, max_hits, visibility); } #endif /* __VOLUME_RECORD_ALL__ */ diff --git a/intern/cycles/kernel/bvh/bvh_embree.h b/intern/cycles/kernel/bvh/bvh_embree.h new file mode 100644 index 00000000000..34a099ebb4d --- /dev/null +++ b/intern/cycles/kernel/bvh/bvh_embree.h @@ -0,0 +1,126 @@ +/* + * Copyright 2018, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <embree3/rtcore_ray.h> +#include <embree3/rtcore_scene.h> + +#include "kernel/kernel_compat_cpu.h" +#include "kernel/split/kernel_split_data_types.h" +#include "kernel/kernel_globals.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +struct CCLIntersectContext { + typedef enum { + RAY_REGULAR = 0, + RAY_SHADOW_ALL = 1, + RAY_SSS = 2, + RAY_VOLUME_ALL = 3, + + } RayType; + + KernelGlobals *kg; + RayType type; + + /* for shadow rays */ + Intersection *isect_s; + int max_hits; + int num_hits; + + /* for SSS Rays: */ + LocalIntersection *ss_isect; + int sss_object_id; + uint *lcg_state; + + CCLIntersectContext(KernelGlobals *kg_, RayType type_) + { + kg = kg_; + type = type_; + max_hits = 1; + num_hits = 0; + isect_s = NULL; + ss_isect = NULL; + sss_object_id = -1; + lcg_state = NULL; + } +}; + +class IntersectContext +{ +public: + IntersectContext(CCLIntersectContext* ctx) + { + rtcInitIntersectContext(&context); + userRayExt = ctx; + } + RTCIntersectContext context; + CCLIntersectContext* userRayExt; +}; + +ccl_device_inline void kernel_embree_setup_ray(const Ray& ray, RTCRay& rtc_ray, const uint visibility) +{ + rtc_ray.org_x = ray.P.x; + rtc_ray.org_y = ray.P.y; + rtc_ray.org_z = ray.P.z; + rtc_ray.dir_x = ray.D.x; + rtc_ray.dir_y = ray.D.y; + rtc_ray.dir_z = ray.D.z; + rtc_ray.tnear = 0.0f; + rtc_ray.tfar = ray.t; + rtc_ray.time = ray.time; + rtc_ray.mask = visibility; +} + +ccl_device_inline void kernel_embree_setup_rayhit(const Ray& ray, RTCRayHit& rayhit, const uint visibility) +{ + kernel_embree_setup_ray(ray, rayhit.ray, visibility); + rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID; + rayhit.hit.primID = RTC_INVALID_GEOMETRY_ID; +} + +ccl_device_inline void kernel_embree_convert_hit(KernelGlobals *kg, const RTCRay *ray, const RTCHit *hit, Intersection *isect) +{ + bool is_hair = hit->geomID & 1; + isect->u = is_hair ? hit->u : 1.0f - hit->v - hit->u; + isect->v = is_hair ? 
hit->v : hit->u; + isect->t = ray->tfar; + isect->Ng = make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z); + if(hit->instID[0] != RTC_INVALID_GEOMETRY_ID) { + RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData(rtcGetGeometry(kernel_data.bvh.scene, hit->instID[0])); + isect->prim = hit->primID + (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID)) + kernel_tex_fetch(__object_node, hit->instID[0]/2); + isect->object = hit->instID[0]/2; + } + else { + isect->prim = hit->primID + (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(kernel_data.bvh.scene, hit->geomID)); + isect->object = OBJECT_NONE; + } + isect->type = kernel_tex_fetch(__prim_type, isect->prim); +} + +ccl_device_inline void kernel_embree_convert_local_hit(KernelGlobals *kg, const RTCRay *ray, const RTCHit *hit, Intersection *isect, int local_object_id) +{ + isect->u = 1.0f - hit->v - hit->u; + isect->v = hit->u; + isect->t = ray->tfar; + isect->Ng = make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z); + RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData(rtcGetGeometry(kernel_data.bvh.scene, local_object_id * 2)); + isect->prim = hit->primID + (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID)) + kernel_tex_fetch(__object_node, local_object_id); + isect->object = local_object_id; + isect->type = kernel_tex_fetch(__prim_type, isect->prim); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/bvh/bvh_local.h b/intern/cycles/kernel/bvh/bvh_local.h index 2b02f4527bb..8364bc3aa9a 100644 --- a/intern/cycles/kernel/bvh/bvh_local.h +++ b/intern/cycles/kernel/bvh/bvh_local.h @@ -136,7 +136,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, node_addr, PATH_RAY_ALL_VISIBILITY, dist); -#else // __KERNEL_SSE2__ +#else // __KERNEL_SSE2__ traverse_mask = NODE_INTERSECT(kg, P, dir, @@ -151,7 +151,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, node_addr, PATH_RAY_ALL_VISIBILITY, dist); -#endif // __KERNEL_SSE2__ +#endif // __KERNEL_SSE2__ node_addr = 
__float_as_int(cnodes.z); node_addr_child1 = __float_as_int(cnodes.w); diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h index d525b29fd94..64eb2f3f659 100644 --- a/intern/cycles/kernel/bvh/bvh_shadow_all.h +++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h @@ -124,7 +124,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, node_addr, visibility, dist); -#else // __KERNEL_SSE2__ +#else // __KERNEL_SSE2__ traverse_mask = NODE_INTERSECT(kg, P, dir, @@ -139,7 +139,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, node_addr, visibility, dist); -#endif // __KERNEL_SSE2__ +#endif // __KERNEL_SSE2__ node_addr = __float_as_int(cnodes.z); node_addr_child1 = __float_as_int(cnodes.w); diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h index e95d2408201..af9f04db0ba 100644 --- a/intern/cycles/kernel/bvh/bvh_traversal.h +++ b/intern/cycles/kernel/bvh/bvh_traversal.h @@ -146,7 +146,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, visibility, dist); } -#else // __KERNEL_SSE2__ +#else // __KERNEL_SSE2__ # if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) if(difl != 0.0f) { traverse_mask = NODE_INTERSECT_ROBUST(kg, @@ -184,7 +184,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, visibility, dist); } -#endif // __KERNEL_SSE2__ +#endif // __KERNEL_SSE2__ node_addr = __float_as_int(cnodes.z); node_addr_child1 = __float_as_int(cnodes.w); diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h index 7d03855cb8f..12d4c5eb94a 100644 --- a/intern/cycles/kernel/bvh/bvh_volume.h +++ b/intern/cycles/kernel/bvh/bvh_volume.h @@ -120,7 +120,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, node_addr, visibility, dist); -#else // __KERNEL_SSE2__ +#else // __KERNEL_SSE2__ traverse_mask = NODE_INTERSECT(kg, P, dir, @@ -135,7 +135,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, node_addr, visibility, 
dist); -#endif // __KERNEL_SSE2__ +#endif // __KERNEL_SSE2__ node_addr = __float_as_int(cnodes.z); node_addr_child1 = __float_as_int(cnodes.w); diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h index 3d9b598914f..6205b9bcf7a 100644 --- a/intern/cycles/kernel/bvh/bvh_volume_all.h +++ b/intern/cycles/kernel/bvh/bvh_volume_all.h @@ -124,7 +124,7 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, node_addr, visibility, dist); -#else // __KERNEL_SSE2__ +#else // __KERNEL_SSE2__ traverse_mask = NODE_INTERSECT(kg, P, dir, @@ -139,7 +139,7 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, node_addr, visibility, dist); -#endif // __KERNEL_SSE2__ +#endif // __KERNEL_SSE2__ node_addr = __float_as_int(cnodes.z); node_addr_child1 = __float_as_int(cnodes.w); diff --git a/intern/cycles/kernel/bvh/obvh_local.h b/intern/cycles/kernel/bvh/obvh_local.h index 92143193a6a..eb24a607caa 100644 --- a/intern/cycles/kernel/bvh/obvh_local.h +++ b/intern/cycles/kernel/bvh/obvh_local.h @@ -73,12 +73,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, object = local_object; } -#ifndef __KERNEL_SSE41__ - if(!isfinite(P.x)) { - return false; - } -#endif - avxf tnear(0.0f), tfar(isect_t); #if BVH_FEATURE(BVH_HAIR) avx3f dir4(avxf(dir.x), avxf(dir.y), avxf(dir.z)); diff --git a/intern/cycles/kernel/bvh/obvh_shadow_all.h b/intern/cycles/kernel/bvh/obvh_shadow_all.h index 3e877065127..8b739b3438a 100644 --- a/intern/cycles/kernel/bvh/obvh_shadow_all.h +++ b/intern/cycles/kernel/bvh/obvh_shadow_all.h @@ -66,12 +66,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, *num_hits = 0; isect_array->t = tmax; -#ifndef __KERNEL_SSE41__ - if(!isfinite(P.x)) { - return false; - } -#endif - #if BVH_FEATURE(BVH_INSTANCING) int num_hits_in_instance = 0; #endif @@ -103,7 +97,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, /* Traverse internal nodes. 
*/ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); - (void)inodes; + (void) inodes; if(false #ifdef __VISIBILITY_FLAG__ diff --git a/intern/cycles/kernel/bvh/obvh_traversal.h b/intern/cycles/kernel/bvh/obvh_traversal.h index 2021d8e1143..6bb19eb1ed9 100644 --- a/intern/cycles/kernel/bvh/obvh_traversal.h +++ b/intern/cycles/kernel/bvh/obvh_traversal.h @@ -64,12 +64,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, Transform ob_itfm; #endif -#ifndef __KERNEL_SSE41__ - if(!isfinite(P.x)) { - return false; - } -#endif - isect->t = ray->t; isect->u = 0.0f; isect->v = 0.0f; @@ -103,7 +97,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, /* Traverse internal nodes. */ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); - (void)inodes; + (void) inodes; if(UNLIKELY(node_dist > isect->t) #if BVH_FEATURE(BVH_MOTION) @@ -179,7 +173,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, avxf cnodes; /* TODO(sergey): Investigate whether moving cnodes upwards * gives a speedup (will be different cache pattern but will - * avoid extra check here), + * avoid extra check here). 
*/ #if BVH_FEATURE(BVH_HAIR) if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { diff --git a/intern/cycles/kernel/bvh/obvh_volume.h b/intern/cycles/kernel/bvh/obvh_volume.h index da9ddbd4f24..80d09c59039 100644 --- a/intern/cycles/kernel/bvh/obvh_volume.h +++ b/intern/cycles/kernel/bvh/obvh_volume.h @@ -52,12 +52,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, Transform ob_itfm; #endif -#ifndef __KERNEL_SSE41__ - if(!isfinite(P.x)) { - return false; - } -#endif - isect->t = ray->t; isect->u = 0.0f; isect->v = 0.0f; diff --git a/intern/cycles/kernel/bvh/obvh_volume_all.h b/intern/cycles/kernel/bvh/obvh_volume_all.h index a88573e6f86..87216127ddb 100644 --- a/intern/cycles/kernel/bvh/obvh_volume_all.h +++ b/intern/cycles/kernel/bvh/obvh_volume_all.h @@ -58,12 +58,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(OBVH)(KernelGlobals *kg, uint num_hits = 0; isect_array->t = tmax; -#ifndef __KERNEL_SSE41__ - if(!isfinite(P.x)) { - return 0; - } -#endif - #if BVH_FEATURE(BVH_INSTANCING) int num_hits_in_instance = 0; #endif diff --git a/intern/cycles/kernel/bvh/qbvh_local.h b/intern/cycles/kernel/bvh/qbvh_local.h index ee3827de309..22d434a8737 100644 --- a/intern/cycles/kernel/bvh/qbvh_local.h +++ b/intern/cycles/kernel/bvh/qbvh_local.h @@ -82,12 +82,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, object = local_object; } -#ifndef __KERNEL_SSE41__ - if(!isfinite(P.x)) { - return false; - } -#endif - ssef tnear(0.0f), tfar(isect_t); #if BVH_FEATURE(BVH_HAIR) sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z)); diff --git a/intern/cycles/kernel/bvh/qbvh_shadow_all.h b/intern/cycles/kernel/bvh/qbvh_shadow_all.h index 46fd178aed6..37606e10b92 100644 --- a/intern/cycles/kernel/bvh/qbvh_shadow_all.h +++ b/intern/cycles/kernel/bvh/qbvh_shadow_all.h @@ -66,11 +66,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, *num_hits = 0; isect_array->t = tmax; -#ifndef __KERNEL_SSE41__ - if(!isfinite(P.x)) { - return false; - } 
-#endif #if BVH_FEATURE(BVH_INSTANCING) int num_hits_in_instance = 0; @@ -103,7 +98,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Traverse internal nodes. */ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); - (void)inodes; + (void) inodes; if(false #ifdef __VISIBILITY_FLAG__ diff --git a/intern/cycles/kernel/bvh/qbvh_traversal.h b/intern/cycles/kernel/bvh/qbvh_traversal.h index 335a4afd47a..35c6e3aeec9 100644 --- a/intern/cycles/kernel/bvh/qbvh_traversal.h +++ b/intern/cycles/kernel/bvh/qbvh_traversal.h @@ -71,12 +71,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, Transform ob_itfm; #endif -#ifndef __KERNEL_SSE41__ - if(!isfinite(P.x)) { - return false; - } -#endif - isect->t = ray->t; isect->u = 0.0f; isect->v = 0.0f; @@ -112,7 +106,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Traverse internal nodes. */ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); - (void)inodes; + (void) inodes; if(UNLIKELY(node_dist > isect->t) #if BVH_FEATURE(BVH_MOTION) @@ -188,7 +182,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, float4 cnodes; /* TODO(sergey): Investigate whether moving cnodes upwards * gives a speedup (will be different cache pattern but will - * avoid extra check here), + * avoid extra check here). 
*/ #if BVH_FEATURE(BVH_HAIR) if(__float_as_uint(inodes.x) & PATH_RAY_NODE_UNALIGNED) { diff --git a/intern/cycles/kernel/bvh/qbvh_volume.h b/intern/cycles/kernel/bvh/qbvh_volume.h index 192ce009524..7ec264e5f78 100644 --- a/intern/cycles/kernel/bvh/qbvh_volume.h +++ b/intern/cycles/kernel/bvh/qbvh_volume.h @@ -58,12 +58,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, Transform ob_itfm; #endif -#ifndef __KERNEL_SSE41__ - if(!isfinite(P.x)) { - return false; - } -#endif - isect->t = ray->t; isect->u = 0.0f; isect->v = 0.0f; diff --git a/intern/cycles/kernel/bvh/qbvh_volume_all.h b/intern/cycles/kernel/bvh/qbvh_volume_all.h index 1e454e4d36b..dd603d79334 100644 --- a/intern/cycles/kernel/bvh/qbvh_volume_all.h +++ b/intern/cycles/kernel/bvh/qbvh_volume_all.h @@ -64,12 +64,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, uint num_hits = 0; isect_array->t = tmax; -#ifndef __KERNEL_SSE41__ - if(!isfinite(P.x)) { - return 0; - } -#endif - #if BVH_FEATURE(BVH_INSTANCING) int num_hits_in_instance = 0; #endif diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h index ff238b7a834..4e7425bd800 100644 --- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h +++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h @@ -232,4 +232,4 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng, CCL_NAMESPACE_END -#endif /* __BSDF_ASHIKHMIN_SHIRLEY_H__ */ +#endif /* __BSDF_ASHIKHMIN_SHIRLEY_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h index b0bdea723b9..80fd9ba2b37 100644 --- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h +++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h @@ -158,4 +158,4 @@ ccl_device int bsdf_ashikhmin_velvet_sample(const ShaderClosure *sc, float3 Ng, CCL_NAMESPACE_END -#endif /* __BSDF_ASHIKHMIN_VELVET_H__ */ +#endif /* 
__BSDF_ASHIKHMIN_VELVET_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h index ee6d4cdf2df..946c460a70e 100644 --- a/intern/cycles/kernel/closure/bsdf_diffuse.h +++ b/intern/cycles/kernel/closure/bsdf_diffuse.h @@ -139,4 +139,4 @@ ccl_device int bsdf_translucent_sample(const ShaderClosure *sc, float3 Ng, float CCL_NAMESPACE_END -#endif /* __BSDF_DIFFUSE_H__ */ +#endif /* __BSDF_DIFFUSE_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h index 35bb2fdf0e8..ca33a5b275c 100644 --- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h +++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h @@ -103,8 +103,8 @@ ccl_device int bsdf_diffuse_ramp_sample(const ShaderClosure *sc, float3 Ng, floa return LABEL_REFLECT|LABEL_DIFFUSE; } -#endif /* __OSL__ */ +#endif /* __OSL__ */ CCL_NAMESPACE_END -#endif /* __BSDF_DIFFUSE_RAMP_H__ */ +#endif /* __BSDF_DIFFUSE_RAMP_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_hair.h b/intern/cycles/kernel/closure/bsdf_hair.h index 7b44a23f05b..e1a0cfaa3f5 100644 --- a/intern/cycles/kernel/closure/bsdf_hair.h +++ b/intern/cycles/kernel/closure/bsdf_hair.h @@ -277,4 +277,4 @@ ccl_device int bsdf_hair_transmission_sample(const ShaderClosure *sc, float3 Ng, CCL_NAMESPACE_END -#endif /* __BSDF_HAIR_H__ */ +#endif /* __BSDF_HAIR_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_hair_principled.h b/intern/cycles/kernel/closure/bsdf_hair_principled.h index b3b56be39ff..68335ee887a 100644 --- a/intern/cycles/kernel/closure/bsdf_hair_principled.h +++ b/intern/cycles/kernel/closure/bsdf_hair_principled.h @@ -229,7 +229,7 @@ ccl_device int bsdf_principled_hair_setup(ShaderData *sd, PrincipledHairBSDF *bs return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG; } -#endif /* __HAIR__ */ +#endif /* __HAIR__ */ /* Given the Fresnel term and transmittance, generate the attenuation terms for each bounce. 
*/ ccl_device_inline void hair_attenuation(KernelGlobals *kg, @@ -296,7 +296,7 @@ ccl_device float3 bsdf_principled_hair_eval(KernelGlobals *kg, float3 Y = float4_to_float3(bsdf->extra->geom); float3 X = safe_normalize(sd->dPdu); - kernel_assert(fabsf(dot(X, Y)) < 1e-4f); + kernel_assert(fabsf(dot(X, Y)) < 1e-3f); float3 Z = safe_normalize(cross(X, Y)); float3 wo = make_float3(dot(sd->I, X), dot(sd->I, Y), dot(sd->I, Z)); @@ -378,7 +378,7 @@ ccl_device int bsdf_principled_hair_sample(KernelGlobals *kg, float3 Y = float4_to_float3(bsdf->extra->geom); float3 X = safe_normalize(sd->dPdu); - kernel_assert(fabsf(dot(X, Y)) < 1e-4f); + kernel_assert(fabsf(dot(X, Y)) < 1e-3f); float3 Z = safe_normalize(cross(X, Y)); float3 wo = make_float3(dot(sd->I, X), dot(sd->I, Y), dot(sd->I, Z)); @@ -499,4 +499,4 @@ ccl_device void bsdf_principled_hair_blur(ShaderClosure *sc, float roughness) CCL_NAMESPACE_END -#endif /* __BSDF_HAIR_PRINCIPLED_H__ */ +#endif /* __BSDF_HAIR_PRINCIPLED_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h index e74d5ebaa42..32b6e50b09a 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet.h @@ -1124,4 +1124,4 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl CCL_NAMESPACE_END -#endif /* __BSDF_MICROFACET_H__ */ +#endif /* __BSDF_MICROFACET_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h index e73915dbda7..5d300ef6db5 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h @@ -76,7 +76,7 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( eval *= -lambda_r / (shadowing_lambda - lambda_r); else eval *= -lambda_r * beta(-lambda_r, shadowing_lambda+1.0f); -#else /* MF_MULTI_GLOSSY */ +#else /* MF_MULTI_GLOSSY */ const float G2 
= 1.0f / (1.0f - (lambda_r + 1.0f) + shadowing_lambda); float val = G2 * 0.25f / wi.z; if(alpha.x == alpha.y) @@ -129,7 +129,7 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( phase = mf_eval_phase_glass(wr, lambda_r, wo, wo_outside, alpha, eta); else phase = mf_eval_phase_glass(wr, lambda_r, -wo, !wo_outside, alpha, 1.0f/eta); -#else /* MF_MULTI_GLOSSY */ +#else /* MF_MULTI_GLOSSY */ phase = mf_eval_phase_glossy(wr, lambda_r, wo, alpha) * throughput; #endif eval += throughput * phase * mf_G1(wo_outside? wo: -wo, mf_C1((outside == wo_outside)? hr: -hr), shadowing_lambda); @@ -153,7 +153,7 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( else if(use_fresnel && order > 0) { throughput *= interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0); } -#else /* MF_MULTI_GLOSSY */ +#else /* MF_MULTI_GLOSSY */ if(use_fresnel && order > 0) { throughput *= interpolate_fresnel_color(-wr, wm, eta, F0, cspec0); } @@ -248,7 +248,7 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)( throughput *= t_color; } } -#else /* MF_MULTI_GLOSSY */ +#else /* MF_MULTI_GLOSSY */ if(use_fresnel) { float3 t_color = interpolate_fresnel_color(-wr, wm, eta, F0, cspec0); diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h index 6b770fc0c16..3446d1609d9 100644 --- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h +++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h @@ -108,4 +108,4 @@ ccl_device int bsdf_oren_nayar_sample(const ShaderClosure *sc, float3 Ng, float3 CCL_NAMESPACE_END -#endif /* __BSDF_OREN_NAYAR_H__ */ +#endif /* __BSDF_OREN_NAYAR_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h index 91c7803346d..83da05ac435 100644 --- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h +++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h @@ -135,8 +135,8 @@ ccl_device int bsdf_phong_ramp_sample(const ShaderClosure *sc, float3 Ng, float3 
return LABEL_REFLECT|LABEL_GLOSSY; } -#endif /* __OSL__ */ +#endif /* __OSL__ */ CCL_NAMESPACE_END -#endif /* __BSDF_PHONG_RAMP_H__ */ +#endif /* __BSDF_PHONG_RAMP_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h index 83be2b35a00..2f65fd54be2 100644 --- a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h +++ b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h @@ -122,4 +122,4 @@ ccl_device int bsdf_principled_diffuse_sample(const ShaderClosure *sc, CCL_NAMESPACE_END -#endif /* __BSDF_PRINCIPLED_DIFFUSE_H__ */ +#endif /* __BSDF_PRINCIPLED_DIFFUSE_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_principled_sheen.h b/intern/cycles/kernel/closure/bsdf_principled_sheen.h index 8b7c4399516..ccdcb1babd2 100644 --- a/intern/cycles/kernel/closure/bsdf_principled_sheen.h +++ b/intern/cycles/kernel/closure/bsdf_principled_sheen.h @@ -108,4 +108,4 @@ ccl_device int bsdf_principled_sheen_sample(const ShaderClosure *sc, CCL_NAMESPACE_END -#endif /* __BSDF_PRINCIPLED_SHEEN_H__ */ +#endif /* __BSDF_PRINCIPLED_SHEEN_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_reflection.h b/intern/cycles/kernel/closure/bsdf_reflection.h index b33b6e3597b..94f1c283af7 100644 --- a/intern/cycles/kernel/closure/bsdf_reflection.h +++ b/intern/cycles/kernel/closure/bsdf_reflection.h @@ -77,4 +77,4 @@ ccl_device int bsdf_reflection_sample(const ShaderClosure *sc, float3 Ng, float3 CCL_NAMESPACE_END -#endif /* __BSDF_REFLECTION_H__ */ +#endif /* __BSDF_REFLECTION_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_refraction.h b/intern/cycles/kernel/closure/bsdf_refraction.h index b181650e928..abdd01c7a1d 100644 --- a/intern/cycles/kernel/closure/bsdf_refraction.h +++ b/intern/cycles/kernel/closure/bsdf_refraction.h @@ -86,4 +86,4 @@ ccl_device int bsdf_refraction_sample(const ShaderClosure *sc, float3 Ng, float3 CCL_NAMESPACE_END -#endif /* __BSDF_REFRACTION_H__ */ +#endif /* 
__BSDF_REFRACTION_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h index 6d8074b7130..097a56f22eb 100644 --- a/intern/cycles/kernel/closure/bsdf_toon.h +++ b/intern/cycles/kernel/closure/bsdf_toon.h @@ -215,4 +215,4 @@ ccl_device int bsdf_glossy_toon_sample(const ShaderClosure *sc, float3 Ng, float CCL_NAMESPACE_END -#endif /* __BSDF_TOON_H__ */ +#endif /* __BSDF_TOON_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_transparent.h b/intern/cycles/kernel/closure/bsdf_transparent.h index f788dbcd0ff..060dff69f52 100644 --- a/intern/cycles/kernel/closure/bsdf_transparent.h +++ b/intern/cycles/kernel/closure/bsdf_transparent.h @@ -106,4 +106,4 @@ ccl_device int bsdf_transparent_sample(const ShaderClosure *sc, float3 Ng, float CCL_NAMESPACE_END -#endif /* __BSDF_TRANSPARENT_H__ */ +#endif /* __BSDF_TRANSPARENT_H__ */ diff --git a/intern/cycles/kernel/closure/bsdf_util.h b/intern/cycles/kernel/closure/bsdf_util.h index b080e025d16..4f3453675c7 100644 --- a/intern/cycles/kernel/closure/bsdf_util.h +++ b/intern/cycles/kernel/closure/bsdf_util.h @@ -158,4 +158,4 @@ ccl_device_forceinline float3 interpolate_fresnel_color(float3 L, float3 H, floa CCL_NAMESPACE_END -#endif /* __BSDF_UTIL_H__ */ +#endif /* __BSDF_UTIL_H__ */ diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h index ba0c6ae8c61..98c7f23c288 100644 --- a/intern/cycles/kernel/closure/bssrdf.h +++ b/intern/cycles/kernel/closure/bssrdf.h @@ -499,4 +499,4 @@ ccl_device_forceinline float bssrdf_pdf(const ShaderClosure *sc, float r) CCL_NAMESPACE_END -#endif /* __KERNEL_BSSRDF_H__ */ +#endif /* __KERNEL_BSSRDF_H__ */ diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h index f6e474d6702..4209d69ee73 100644 --- a/intern/cycles/kernel/filter/filter.h +++ b/intern/cycles/kernel/filter/filter.h @@ -49,4 +49,4 @@ CCL_NAMESPACE_BEGIN CCL_NAMESPACE_END -#endif /* __FILTER_H__ */ +#endif /* 
__FILTER_H__ */ diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h index 1a2f22a6987..67f4e62ac0f 100644 --- a/intern/cycles/kernel/filter/filter_defines.h +++ b/intern/cycles/kernel/filter/filter_defines.h @@ -68,4 +68,4 @@ typedef struct TileInfo { # define ccl_get_tile_buffer(id) (tile_info->buffers[id]) #endif -#endif /* __FILTER_DEFINES_H__*/ +#endif /* __FILTER_DEFINES_H__*/ diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h index e2da0fd872b..af73c0dadf2 100644 --- a/intern/cycles/kernel/filter/filter_nlm_cpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h @@ -16,6 +16,9 @@ CCL_NAMESPACE_BEGIN +#define load4_a(buf, ofs) (*((float4*) ((buf) + (ofs)))) +#define load4_u(buf, ofs) load_float4((buf)+(ofs)) + ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, const float *ccl_restrict weight_image, const float *ccl_restrict variance_image, @@ -26,20 +29,28 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, float a, float k_2) { + /* Strides need to be aligned to 16 bytes. */ + kernel_assert((stride % 4) == 0 && (channel_offset % 4) == 0); + + int aligned_lowx = rect.x & (~3); + const int numChannels = (channel_offset > 0)? 3 : 1; + const float4 channel_fac = make_float4(1.0f / numChannels); + for(int y = rect.y; y < rect.w; y++) { - for(int x = rect.x; x < rect.z; x++) { - float diff = 0.0f; - int numChannels = channel_offset? 
3 : 1; - for(int c = 0; c < numChannels; c++) { - float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)]; - float pvar = variance_image[c*channel_offset + y*stride + x]; - float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)]; - diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); - } - if(numChannels > 1) { - diff *= 1.0f/numChannels; + int idx_p = y*stride + aligned_lowx; + int idx_q = (y+dy)*stride + aligned_lowx + dx; + for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) { + float4 diff = make_float4(0.0f); + for(int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) { + /* idx_p is guaranteed to be aligned, but idx_q isn't. */ + float4 color_p = load4_a(weight_image, idx_p + chan_ofs); + float4 color_q = load4_u(weight_image, idx_q + chan_ofs); + float4 cdiff = color_p - color_q; + float4 var_p = load4_a(variance_image, idx_p + chan_ofs); + float4 var_q = load4_u(variance_image, idx_q + chan_ofs); + diff += (cdiff*cdiff - a*(var_p + min(var_p, var_q))) / (make_float4(1e-8f) + k_2*(var_p+var_q)); } - difference_image[y*stride + x] = diff; + load4_a(difference_image, idx_p) = diff*channel_fac; } } } @@ -50,52 +61,77 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen int stride, int f) { - int aligned_lowx = rect.x / 4; - int aligned_highx = (rect.z + 3) / 4; + int aligned_lowx = round_down(rect.x, 4); for(int y = rect.y; y < rect.w; y++) { const int low = max(rect.y, y-f); const int high = min(rect.w, y+f+1); - for(int x = rect.x; x < rect.z; x++) { - out_image[y*stride + x] = 0.0f; + for(int x = aligned_lowx; x < rect.z; x += 4) { + load4_a(out_image, y*stride + x) = make_float4(0.0f); } for(int y1 = low; y1 < high; y1++) { - float4* out_image4 = (float4*)(out_image + y*stride); - float4* difference_image4 = (float4*)(difference_image + y1*stride); - for(int x = aligned_lowx; x < 
aligned_highx; x++) { - out_image4[x] += difference_image4[x]; + for(int x = aligned_lowx; x < rect.z; x += 4) { + load4_a(out_image, y*stride + x) += load4_a(difference_image, y1*stride + x); } } - for(int x = rect.x; x < rect.z; x++) { - out_image[y*stride + x] *= 1.0f/(high - low); + float fac = 1.0f/(high - low); + for(int x = aligned_lowx; x < rect.z; x += 4) { + load4_a(out_image, y*stride + x) *= fac; } } } -ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image, - float *out_image, - int4 rect, - int stride, - int f) +ccl_device_inline void nlm_blur_horizontal(const float *ccl_restrict difference_image, + float *out_image, + int4 rect, + int stride, + int f) { + int aligned_lowx = round_down(rect.x, 4); for(int y = rect.y; y < rect.w; y++) { - for(int x = rect.x; x < rect.z; x++) { - out_image[y*stride + x] = 0.0f; + for(int x = aligned_lowx; x < rect.z; x += 4) { + load4_a(out_image, y*stride + x) = make_float4(0.0f); } } + for(int dx = -f; dx <= f; dx++) { - int pos_dx = max(0, dx); - int neg_dx = min(0, dx); + aligned_lowx = round_down(rect.x - min(0, dx), 4); + int highx = rect.z - max(0, dx); + int4 lowx4 = make_int4(rect.x - min(0, dx)); + int4 highx4 = make_int4(rect.z - max(0, dx)); for(int y = rect.y; y < rect.w; y++) { - for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) { - out_image[y*stride + x] += difference_image[y*stride + x+dx]; + for(int x = aligned_lowx; x < highx; x += 4) { + int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3); + int4 active = (x4 >= lowx4) & (x4 < highx4); + + float4 diff = load4_u(difference_image, y*stride + x + dx); + load4_a(out_image, y*stride + x) += mask(active, diff); } } } + + aligned_lowx = round_down(rect.x, 4); for(int y = rect.y; y < rect.w; y++) { - for(int x = rect.x; x < rect.z; x++) { - const int low = max(rect.x, x-f); - const int high = min(rect.z, x+f+1); - out_image[y*stride + x] = fast_expf(-max(out_image[y*stride + x] * (1.0f/(high - low)), 0.0f)); + for(int x = 
aligned_lowx; x < rect.z; x += 4) { + float4 x4 = make_float4(x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); + float4 low = max(make_float4(rect.x), x4 - make_float4(f)); + float4 high = min(make_float4(rect.z), x4 + make_float4(f+1)); + load4_a(out_image, y*stride + x) *= rcp(high - low); + } + } +} + +ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image, + float *out_image, + int4 rect, + int stride, + int f) +{ + nlm_blur_horizontal(difference_image, out_image, rect, stride, f); + + int aligned_lowx = round_down(rect.x, 4); + for(int y = rect.y; y < rect.w; y++) { + for(int x = aligned_lowx; x < rect.z; x += 4) { + load4_a(out_image, y*stride + x) = fast_expf4(-max(load4_a(out_image, y*stride + x), make_float4(0.0f))); } } } @@ -103,23 +139,29 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict d ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, const float *ccl_restrict difference_image, const float *ccl_restrict image, + float *temp_image, float *out_image, float *accum_image, int4 rect, int stride, int f) { + nlm_blur_horizontal(difference_image, temp_image, rect, stride, f); + + int aligned_lowx = round_down(rect.x, 4); for(int y = rect.y; y < rect.w; y++) { - for(int x = rect.x; x < rect.z; x++) { - const int low = max(rect.x, x-f); - const int high = min(rect.z, x+f+1); - float sum = 0.0f; - for(int x1 = low; x1 < high; x1++) { - sum += difference_image[y*stride + x1]; - } - float weight = sum * (1.0f/(high - low)); - accum_image[y*stride + x] += weight; - out_image[y*stride + x] += weight*image[(y+dy)*stride + (x+dx)]; + for(int x = aligned_lowx; x < rect.z; x += 4) { + int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3); + int4 active = (x4 >= make_int4(rect.x)) & (x4 < make_int4(rect.z)); + + int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx); + + float4 weight = load4_a(temp_image, idx_p); + load4_a(accum_image, idx_p) += mask(active, weight); + + float4 val = 
load4_u(image, idx_q); + + load4_a(out_image, idx_p) += mask(active, weight*val); } } } @@ -177,4 +219,7 @@ ccl_device_inline void kernel_filter_nlm_normalize(float *out_image, } } +#undef load4_a +#undef load4_u + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom_curve_intersect.h b/intern/cycles/kernel/geom/geom_curve_intersect.h index 4cfbe21685c..b6b58b52a29 100644 --- a/intern/cycles/kernel/geom/geom_curve_intersect.h +++ b/intern/cycles/kernel/geom/geom_curve_intersect.h @@ -379,7 +379,7 @@ ccl_device_forceinline bool cardinal_curve_intersect( float inv_mw_extension = 1.0f/mw_extension; if(d0 >= 0) coverage = (min(d1 * inv_mw_extension, 1.0f) - min(d0 * inv_mw_extension, 1.0f)) * 0.5f; - else // inside + else // inside coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) * 0.5f; } @@ -817,16 +817,24 @@ ccl_device_inline float3 curve_refine(KernelGlobals *kg, sd->Ng = normalize(-(D - tg * (dot(tg, D)))); } else { - /* direction from inside to surface of curve */ - float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]); - sd->Ng = normalize(P - p_curr); +#ifdef __EMBREE__ + if(kernel_data.bvh.scene) { + sd->Ng = normalize(isect->Ng); + } + else +#endif + { + /* direction from inside to surface of curve */ + float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]); + sd->Ng = normalize(P - p_curr); - /* adjustment for changing radius */ - float gd = isect->v; + /* adjustment for changing radius */ + float gd = isect->v; - if(gd != 0.0f) { - sd->Ng = sd->Ng - gd * tg; - sd->Ng = normalize(sd->Ng); + if(gd != 0.0f) { + sd->Ng = sd->Ng - gd * tg; + sd->Ng = normalize(sd->Ng); + } } } diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h index cfe17e63627..669c932d720 100644 --- a/intern/cycles/kernel/geom/geom_object.h +++ b/intern/cycles/kernel/geom/geom_object.h @@ -78,6 +78,12 @@ ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals *kg, int const uint 
num_steps = kernel_tex_fetch(__objects, object).numsteps * 2 + 1; Transform tfm; +#ifdef __EMBREE__ + if(kernel_data.bvh.scene) { + transform_motion_array_interpolate_straight(&tfm, motion, num_steps, time); + } + else +#endif transform_motion_array_interpolate(&tfm, motion, num_steps, time); return tfm; @@ -304,6 +310,24 @@ ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd) return kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).pass_id; } +/* Cryptomatte ID */ + +ccl_device_inline float object_cryptomatte_id(KernelGlobals *kg, int object) +{ + if(object == OBJECT_NONE) + return 0.0f; + + return kernel_tex_fetch(__objects, object).cryptomatte_object; +} + +ccl_device_inline float object_cryptomatte_asset_id(KernelGlobals *kg, int object) +{ + if(object == OBJECT_NONE) + return 0; + + return kernel_tex_fetch(__objects, object).cryptomatte_asset; +} + /* Particle data from which object was instanced */ ccl_device_inline uint particle_index(KernelGlobals *kg, int particle) diff --git a/intern/cycles/kernel/geom/geom_subd_triangle.h b/intern/cycles/kernel/geom/geom_subd_triangle.h index 00ce89ae567..8c0d0a9770e 100644 --- a/intern/cycles/kernel/geom/geom_subd_triangle.h +++ b/intern/cycles/kernel/geom/geom_subd_triangle.h @@ -146,7 +146,7 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const return a; } else -#endif /* __PATCH_EVAL__ */ +#endif /* __PATCH_EVAL__ */ if(desc.element == ATTR_ELEMENT_FACE) { if(dx) *dx = 0.0f; if(dy) *dy = 0.0f; @@ -271,7 +271,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con return a; } else -#endif /* __PATCH_EVAL__ */ +#endif /* __PATCH_EVAL__ */ if(desc.element == ATTR_ELEMENT_FACE) { if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h index aa6b102a0f3..57f4c86d403 100644 --- 
a/intern/cycles/kernel/geom/geom_triangle_intersect.h +++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h @@ -71,28 +71,23 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg, } #ifdef __KERNEL_AVX2__ - #define cross256(A,B, C,D) _mm256_fmsub_ps(A,B, _mm256_mul_ps(C,D)) -#if defined(__KERNEL_CUDA__) && __CUDA_ARCH__ < 300 -ccl_device_inline -#else -ccl_device_forceinline -#endif -int ray_triangle_intersect8(KernelGlobals *kg, - float3 ray_P, - float3 ray_dir, - Intersection **isect, - uint visibility, - int object, - __m256 *triA, - __m256 *triB, - __m256 *triC, - int prim_addr, - int prim_num, - uint *num_hits, - uint max_hits, - int *num_hits_in_instance, - float isec_t) +ccl_device_inline int ray_triangle_intersect8( + KernelGlobals *kg, + float3 ray_P, + float3 ray_dir, + Intersection **isect, + uint visibility, + int object, + __m256 *triA, + __m256 *triB, + __m256 *triC, + int prim_addr, + int prim_num, + uint *num_hits, + uint max_hits, + int *num_hits_in_instance, + float isect_t) { const unsigned char prim_num_mask = (1 << prim_num) - 1; @@ -108,10 +103,6 @@ int ray_triangle_intersect8(KernelGlobals *kg, const __m256 dirz256 = _mm256_set1_ps(ray_dir.z); /* Calculate vertices relative to ray origin. */ - /* const float3 v0 = tri_c - P; - const float3 v1 = tri_a - P; - const float3 v2 = tri_b - P; */ - __m256 v0_x_256 = _mm256_sub_ps(triC[0], Px256); __m256 v0_y_256 = _mm256_sub_ps(triC[1], Py256); __m256 v0_z_256 = _mm256_sub_ps(triC[2], Pz256); @@ -136,11 +127,7 @@ int ray_triangle_intersect8(KernelGlobals *kg, __m256 v1_v2_y_256 = _mm256_add_ps(v1_y_256, v2_y_256); __m256 v1_v2_z_256 = _mm256_add_ps(v1_z_256, v2_z_256); - /* Calculate triangle edges. - const float3 e0 = v2 - v0; - const float3 e1 = v0 - v1; - const float3 e2 = v1 - v2;*/ - + /* Calculate triangle edges. 
*/ __m256 e0_x_256 = _mm256_sub_ps(v2_x_256, v0_x_256); __m256 e0_y_256 = _mm256_sub_ps(v2_y_256, v0_y_256); __m256 e0_z_256 = _mm256_sub_ps(v2_z_256, v0_z_256); @@ -153,48 +140,32 @@ int ray_triangle_intersect8(KernelGlobals *kg, __m256 e2_y_256 = _mm256_sub_ps(v1_y_256, v2_y_256); __m256 e2_z_256 = _mm256_sub_ps(v1_z_256, v2_z_256); - /* Perform edge tests. - const float U = dot(cross(v2 + v0, e0), ray_dir); - const float V = dot(cross(v0 + v1, e1), ray_dir); - const float W = dot(cross(v1 + v2, e2), ray_dir);*/ - - //cross (AyBz - AzBy, AzBx -AxBz, AxBy - AyBx) + /* Perform edge tests. */ + /* cross (AyBz - AzBy, AzBx -AxBz, AxBy - AyBx) */ __m256 U_x_256 = cross256(v0_v2_y_256, e0_z_256, v0_v2_z_256, e0_y_256); __m256 U_y_256 = cross256(v0_v2_z_256, e0_x_256, v0_v2_x_256, e0_z_256); __m256 U_z_256 = cross256(v0_v2_x_256, e0_y_256, v0_v2_y_256, e0_x_256); - //vertical dot + /* vertical dot */ __m256 U_256 = _mm256_mul_ps(U_x_256, dirx256); - U_256 = _mm256_fmadd_ps(U_y_256, diry256, U_256); //_mm256_add_ps(U_256, _mm256_mul_ps(U_y_256, diry256)); - U_256 = _mm256_fmadd_ps(U_z_256, dirz256, U_256); //_mm256_add_ps(U_256, _mm256_mul_ps(U_z_256, dirz256)); + U_256 = _mm256_fmadd_ps(U_y_256, diry256, U_256); + U_256 = _mm256_fmadd_ps(U_z_256, dirz256, U_256); __m256 V_x_256 = cross256(v0_v1_y_256, e1_z_256, v0_v1_z_256, e1_y_256); __m256 V_y_256 = cross256(v0_v1_z_256, e1_x_256, v0_v1_x_256, e1_z_256); __m256 V_z_256 = cross256(v0_v1_x_256, e1_y_256, v0_v1_y_256, e1_x_256); - //vertical dot + /* vertical dot */ __m256 V_256 = _mm256_mul_ps(V_x_256, dirx256); - V_256 = _mm256_fmadd_ps(V_y_256, diry256, V_256);// _mm256_add_ps(V_256, _mm256_mul_ps(V_y_256, diry256)); - V_256 = _mm256_fmadd_ps(V_z_256, dirz256, V_256);// _mm256_add_ps(V_256, _mm256_mul_ps(V_z_256, dirz256)); + V_256 = _mm256_fmadd_ps(V_y_256, diry256, V_256); + V_256 = _mm256_fmadd_ps(V_z_256, dirz256, V_256); __m256 W_x_256 = cross256(v1_v2_y_256, e2_z_256, v1_v2_z_256, e2_y_256); __m256 W_y_256 = 
cross256(v1_v2_z_256, e2_x_256, v1_v2_x_256, e2_z_256); __m256 W_z_256 = cross256(v1_v2_x_256, e2_y_256, v1_v2_y_256, e2_x_256); - //vertical dot + /* vertical dot */ __m256 W_256 = _mm256_mul_ps(W_x_256, dirx256); - W_256 = _mm256_fmadd_ps(W_y_256, diry256,W_256);//_mm256_add_ps(W_256, _mm256_mul_ps(W_y_256, diry256)); - W_256 = _mm256_fmadd_ps(W_z_256, dirz256,W_256);//_mm256_add_ps(W_256, _mm256_mul_ps(W_z_256, dirz256)); - - //const float minUVW = min(U, min(V, W)); - //const float maxUVW = max(U, max(V, W)); -#if 0 - __m256 minUVW_256 = _mm256_min_ps(U_256, _mm256_min_ps(V_256, W_256)); - __m256 maxUVW_256 = _mm256_max_ps(U_256, _mm256_max_ps(V_256, W_256)); - - //if(minUVW < 0.0f && maxUVW > 0.0f) - __m256i mask_minmaxUVW_256 = _mm256_and_si256( - _mm256_cmpgt_epi32(zero256, _mm256_castps_si256(minUVW_256)), - //_mm256_castps_si256(minUVW_256), - _mm256_cmpgt_epi32(_mm256_castps_si256(maxUVW_256), zero256)); -#else + W_256 = _mm256_fmadd_ps(W_y_256, diry256,W_256); + W_256 = _mm256_fmadd_ps(W_z_256, dirz256,W_256); + __m256i U_256_1 = _mm256_srli_epi32(_mm256_castps_si256(U_256), 31); __m256i V_256_1 = _mm256_srli_epi32(_mm256_castps_si256(V_256), 31); __m256i W_256_1 = _mm256_srli_epi32(_mm256_castps_si256(W_256), 31); @@ -204,9 +175,8 @@ int ray_triangle_intersect8(KernelGlobals *kg, const __m256i two256 = _mm256_set1_epi32(2); __m256i mask_minmaxUVW_256 = _mm256_or_si256( - _mm256_cmpeq_epi32(one256, UVW_256_1), - _mm256_cmpeq_epi32(two256, UVW_256_1) ); -#endif + _mm256_cmpeq_epi32(one256, UVW_256_1), + _mm256_cmpeq_epi32(two256, UVW_256_1)); unsigned char mask_minmaxUVW_pos = _mm256_movemask_ps(_mm256_castsi256_ps(mask_minmaxUVW_256)); if((mask_minmaxUVW_pos & prim_num_mask) == prim_num_mask) { //all bits set @@ -214,231 +184,187 @@ int ray_triangle_intersect8(KernelGlobals *kg, } /* Calculate geometry normal and denominator. 
*/ - // const float3 Ng1 = cross(e1, e0); - //const Vec3vfM Ng1 = stable_triangle_normal(e2,e1,e0); - __m256 Ng1_x_256 = cross256(e1_y_256, e0_z_256, e1_z_256, e0_y_256); __m256 Ng1_y_256 = cross256(e1_z_256, e0_x_256, e1_x_256, e0_z_256); __m256 Ng1_z_256 = cross256(e1_x_256, e0_y_256, e1_y_256, e0_x_256); - //const float3 Ng = Ng1 + Ng1; Ng1_x_256 = _mm256_add_ps(Ng1_x_256, Ng1_x_256); Ng1_y_256 = _mm256_add_ps(Ng1_y_256, Ng1_y_256); Ng1_z_256 = _mm256_add_ps(Ng1_z_256, Ng1_z_256); - //const float den = dot3(Ng, dir); - //vertical dot + /* vertical dot */ __m256 den_256 = _mm256_mul_ps(Ng1_x_256, dirx256); - den_256 = _mm256_fmadd_ps(Ng1_y_256, diry256,den_256);//_mm256_add_ps(den_256, _mm256_mul_ps(Ng1_y_256, diry256)); - den_256 = _mm256_fmadd_ps(Ng1_z_256, dirz256,den_256);//_mm256_add_ps(den_256, _mm256_mul_ps(Ng1_z_256, dirz256)); - - // __m256i maskden256 = _mm256_cmpeq_epi32(_mm256_castps_si256(den_256), zero256); + den_256 = _mm256_fmadd_ps(Ng1_y_256, diry256,den_256); + den_256 = _mm256_fmadd_ps(Ng1_z_256, dirz256,den_256); /* Perform depth test. 
*/ - //const float T = dot3(v0, Ng); __m256 T_256 = _mm256_mul_ps(Ng1_x_256, v0_x_256); - T_256 = _mm256_fmadd_ps(Ng1_y_256, v0_y_256,T_256);//_mm256_add_ps(T_256, _mm256_mul_ps(Ng1_y_256, v0_y_256)); - T_256 = _mm256_fmadd_ps(Ng1_z_256, v0_z_256,T_256);//_mm256_add_ps(T_256, _mm256_mul_ps(Ng1_z_256, v0_z_256)); + T_256 = _mm256_fmadd_ps(Ng1_y_256, v0_y_256,T_256); + T_256 = _mm256_fmadd_ps(Ng1_z_256, v0_z_256,T_256); - //const int sign_den = (__float_as_int(den) & 0x80000000); const __m256i c0x80000000 = _mm256_set1_epi32(0x80000000); __m256i sign_den_256 = _mm256_and_si256(_mm256_castps_si256(den_256), c0x80000000); - //const float sign_T = xor_signmask(T, sign_den); __m256 sign_T_256 = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(T_256), sign_den_256)); - /*if((sign_T < 0.0f) || mask_minmaxUVW_pos { return false;} */ unsigned char mask_sign_T = _mm256_movemask_ps(sign_T_256); if(((mask_minmaxUVW_pos | mask_sign_T) & prim_num_mask) == prim_num_mask) { return false; - } /**/ + } __m256 xor_signmask_256 = _mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256)); - ccl_align(32) float den8[8], U8[8], V8[8], T8[8], sign_T8[8], xor_signmask8[8]; ccl_align(32) unsigned int mask_minmaxUVW8[8]; - if(visibility == PATH_RAY_SHADOW_OPAQUE){ - __m256i mask_final_256 = _mm256_cmpeq_epi32(mask_minmaxUVW_256, zero256);//~mask_minmaxUVW_256 - - __m256i maskden256 = _mm256_cmpeq_epi32(_mm256_castps_si256(den_256), zero256); - - __m256i mask0 = _mm256_cmpgt_epi32(zero256, _mm256_castps_si256(sign_T_256)); - __m256 rayt_256 = _mm256_set1_ps((*isect)->t); - - __m256i mask1 = _mm256_cmpgt_epi32(_mm256_castps_si256(sign_T_256), - _mm256_castps_si256( - _mm256_mul_ps(_mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256)), rayt_256) - ) - ); - /* __m256i mask1 = _mm256_castps_si256(_mm256_cmp_ps(sign_T_256, - _mm256_mul_ps(_mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256)), rayt_256), - 
_CMP_GT_OS - ) );*/ - - mask0 = _mm256_or_si256(mask1, mask0); - //unsigned char mask = _mm256_movemask_ps(_mm256_castsi256_ps(mask0)); - //unsigned char maskden = _mm256_movemask_ps(_mm256_castsi256_ps(maskden256)); - //unsigned char mask_final = ((~mask) & (~maskden) & (~mask_minmaxUVW_pos)); - mask_final_256 = _mm256_andnot_si256(mask0, mask_final_256); //(~mask_minmaxUVW_pos) &(~mask) - mask_final_256 = _mm256_andnot_si256(maskden256, mask_final_256); //(~mask_minmaxUVW_pos) &(~mask) & (~maskden) - - unsigned char mask_final = _mm256_movemask_ps(_mm256_castsi256_ps(mask_final_256)); - if((mask_final & prim_num_mask) == 0) { //all bits NOT set - return false; - } /**/ - - unsigned long i = 0; -#if defined(_MSC_VER) - unsigned char res = _BitScanForward(&i, (unsigned long)mask_final); -#else - i = __builtin_ffs(mask_final)-1; -#endif - - den_256 = _mm256_rcp_ps(den_256); //inv_den - U_256 = _mm256_mul_ps(U_256, den_256); //*inv_den - V_256 = _mm256_mul_ps(V_256, den_256); //*inv_den - T_256 = _mm256_mul_ps(T_256, den_256); //*inv_den - - _mm256_store_ps(U8, U_256); - _mm256_store_ps(V8, V_256); - _mm256_store_ps(T8, T_256); - - - //here we assume (kernel_tex_fetch(__prim_visibility, (prim_addr +i)) & visibility) is always true - - (*isect)->u = U8[i]; - (*isect)->v = V8[i]; - (*isect)->t = T8[i]; - - (*isect)->prim = (prim_addr + i); - (*isect)->object = object; - (*isect)->type = PRIMITIVE_TRIANGLE; - - return true; + if(visibility == PATH_RAY_SHADOW_OPAQUE) { + __m256i mask_final_256 = _mm256_cmpeq_epi32(mask_minmaxUVW_256, zero256); + __m256i maskden256 = _mm256_cmpeq_epi32(_mm256_castps_si256(den_256), zero256); + __m256i mask0 = _mm256_cmpgt_epi32(zero256, _mm256_castps_si256(sign_T_256)); + __m256 rayt_256 = _mm256_set1_ps((*isect)->t); + __m256i mask1 = _mm256_cmpgt_epi32(_mm256_castps_si256(sign_T_256), + _mm256_castps_si256( + _mm256_mul_ps(_mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(den_256), sign_den_256)), rayt_256) + ) + ); + mask0 = 
_mm256_or_si256(mask1, mask0); + mask_final_256 = _mm256_andnot_si256(mask0, mask_final_256); //(~mask_minmaxUVW_pos) &(~mask) + mask_final_256 = _mm256_andnot_si256(maskden256, mask_final_256); //(~mask_minmaxUVW_pos) &(~mask) & (~maskden) + unsigned char mask_final = _mm256_movemask_ps(_mm256_castsi256_ps(mask_final_256)); + if((mask_final & prim_num_mask) == 0) { + return false; } + const int i = __bsf(mask_final); + __m256 inv_den_256 = _mm256_rcp_ps(den_256); + U_256 = _mm256_mul_ps(U_256, inv_den_256); + V_256 = _mm256_mul_ps(V_256, inv_den_256); + T_256 = _mm256_mul_ps(T_256, inv_den_256); + _mm256_store_ps(U8, U_256); + _mm256_store_ps(V8, V_256); + _mm256_store_ps(T8, T_256); + /* NOTE: Here we assume visibility for all triangles in the node is + * the same. */ + (*isect)->u = U8[i]; + (*isect)->v = V8[i]; + (*isect)->t = T8[i]; + (*isect)->prim = (prim_addr + i); + (*isect)->object = object; + (*isect)->type = PRIMITIVE_TRIANGLE; + return true; + } else { - _mm256_store_ps(den8, den_256); - _mm256_store_ps(U8, U_256); - _mm256_store_ps(V8, V_256); - _mm256_store_ps(T8, T_256); + _mm256_store_ps(den8, den_256); + _mm256_store_ps(U8, U_256); + _mm256_store_ps(V8, V_256); + _mm256_store_ps(T8, T_256); - _mm256_store_ps(sign_T8, sign_T_256); - _mm256_store_ps(xor_signmask8, xor_signmask_256); - _mm256_store_si256((__m256i*)mask_minmaxUVW8, mask_minmaxUVW_256); + _mm256_store_ps(sign_T8, sign_T_256); + _mm256_store_ps(xor_signmask8, xor_signmask_256); + _mm256_store_si256((__m256i*)mask_minmaxUVW8, mask_minmaxUVW_256); - int ret = false; + int ret = false; - if(visibility == PATH_RAY_SHADOW) { - for(int i = 0; i < prim_num; i++) { - if(!mask_minmaxUVW8[i]) { + if(visibility == PATH_RAY_SHADOW) { + for(int i = 0; i < prim_num; i++) { + if(mask_minmaxUVW8[i]) { + continue; + } #ifdef __VISIBILITY_FLAG__ - if(kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility) + if((kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility) == 0) { + 
continue; + } #endif - { - if((sign_T8[i] >= 0.0f) && - (sign_T8[i] <= (*isect)->t * xor_signmask8[i])) - { - if(den8[i]) { - const float inv_den = 1.0f / den8[i]; - - (*isect)->u = U8[i] * inv_den; - (*isect)->v = V8[i] * inv_den; - (*isect)->t = T8[i] * inv_den; - - (*isect)->prim = (prim_addr + i); - (*isect)->object = object; - (*isect)->type = PRIMITIVE_TRIANGLE; - - int prim = kernel_tex_fetch(__prim_index, (*isect)->prim); - int shader = 0; - + if((sign_T8[i] < 0.0f) || + (sign_T8[i] > (*isect)->t * xor_signmask8[i])) + { + continue; + } + if(!den8[i]) { + continue; + } + const float inv_den = 1.0f / den8[i]; + (*isect)->u = U8[i] * inv_den; + (*isect)->v = V8[i] * inv_den; + (*isect)->t = T8[i] * inv_den; + (*isect)->prim = (prim_addr + i); + (*isect)->object = object; + (*isect)->type = PRIMITIVE_TRIANGLE; + const int prim = kernel_tex_fetch(__prim_index, (*isect)->prim); + int shader = 0; #ifdef __HAIR__ - if(kernel_tex_fetch(__prim_type, (*isect)->prim) & PRIMITIVE_ALL_TRIANGLE) + if(kernel_tex_fetch(__prim_type, (*isect)->prim) & PRIMITIVE_ALL_TRIANGLE) #endif - { - shader = kernel_tex_fetch(__tri_shader, prim); - } + { + shader = kernel_tex_fetch(__tri_shader, prim); + } #ifdef __HAIR__ - else { - float4 str = kernel_tex_fetch(__curves, prim); - shader = __float_as_int(str.z); - } + else { + float4 str = kernel_tex_fetch(__curves, prim); + shader = __float_as_int(str.z); + } #endif - int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; - - /* if no transparent shadows, all light is blocked */ - if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) { - return 2; - } - /* if maximum number of hits reached, block all light */ - else if(*num_hits == max_hits) { - return 2; - } - /* move on to next entry in intersections array */ - ret = true; - - (*isect)++; - (*num_hits)++; - - (*num_hits_in_instance)++; - - (*isect)->t = isec_t; - - } //den - } //if sign - } //vis - }//if mask - } //for + const int flag = kernel_tex_fetch(__shaders, (shader & 
SHADER_MASK)).flags; + /* If no transparent shadows, all light is blocked. */ + if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) { + return 2; + } + /* If maximum number of hits reached, block all light. */ + else if(num_hits == NULL || *num_hits == max_hits) { + return 2; + } + /* Move on to next entry in intersections array. */ + ret = true; + (*isect)++; + (*num_hits)++; + (*num_hits_in_instance)++; + (*isect)->t = isect_t; + } } - else { //default case + else { for(int i = 0; i < prim_num; i++) { - if(!mask_minmaxUVW8[i]) { + if(mask_minmaxUVW8[i]) { + continue; + } #ifdef __VISIBILITY_FLAG__ - if(kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility) + if((kernel_tex_fetch(__prim_visibility, (prim_addr + i)) & visibility) == 0) { + continue; + } #endif - { - if((sign_T8[i] >= 0.0f) && - (sign_T8[i] <= (*isect)->t * xor_signmask8[i])) - { - if(den8[i]) { - const float inv_den = 1.0f / den8[i]; - - (*isect)->u = U8[i] * inv_den; - (*isect)->v = V8[i] * inv_den; - (*isect)->t = T8[i] * inv_den; - - (*isect)->prim = (prim_addr + i); - (*isect)->object = object; - (*isect)->type = PRIMITIVE_TRIANGLE; - - ret = true; - } //den - } //if sign - } //vis - }//if mask - } //for - } //default - return ret; -}// else PATH_RAY_SHADOW_OPAQUE - + if((sign_T8[i] < 0.0f) || + (sign_T8[i] > (*isect)->t * xor_signmask8[i])) + { + continue; + } + if(!den8[i]) { + continue; + } + const float inv_den = 1.0f / den8[i]; + (*isect)->u = U8[i] * inv_den; + (*isect)->v = V8[i] * inv_den; + (*isect)->t = T8[i] * inv_den; + (*isect)->prim = (prim_addr + i); + (*isect)->object = object; + (*isect)->type = PRIMITIVE_TRIANGLE; + ret = true; + } + } + return ret; + } } -//vz static -ccl_device_inline -int triangle_intersect8(KernelGlobals *kg, - Intersection **isect, - float3 P, - float3 dir, - uint visibility, - int object, - int prim_addr, - int prim_num, - uint *num_hits, - uint max_hits, - int *num_hits_in_instance, - float isec_t) +ccl_device_inline int triangle_intersect8( + 
KernelGlobals *kg, + Intersection **isect, + float3 P, + float3 dir, + uint visibility, + int object, + int prim_addr, + int prim_num, + uint *num_hits, + uint max_hits, + int *num_hits_in_instance, + float isect_t) { __m128 tri_a[8], tri_b[8], tri_c[8]; __m256 tritmp[12], tri[12]; @@ -540,11 +466,11 @@ int triangle_intersect8(KernelGlobals *kg, num_hits, max_hits, num_hits_in_instance, - isec_t); + isect_t); return result; } -#endif /* __KERNEL_AVX2__ */ +#endif /* __KERNEL_AVX2__ */ /* Special ray intersection routines for subsurface scattering. In that case we * only want to intersect with primitives in the same object, and if case of diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h index 373324afb01..1c8c91d15e6 100644 --- a/intern/cycles/kernel/kernel.h +++ b/intern/cycles/kernel/kernel.h @@ -63,4 +63,4 @@ void kernel_tex_copy(KernelGlobals *kg, CCL_NAMESPACE_END -#endif /* __KERNEL_H__ */ +#endif /* __KERNEL_H__ */ diff --git a/intern/cycles/kernel/kernel_color.h b/intern/cycles/kernel/kernel_color.h index 990e798543a..ea478a8a5d3 100644 --- a/intern/cycles/kernel/kernel_color.h +++ b/intern/cycles/kernel/kernel_color.h @@ -35,4 +35,4 @@ ccl_device float linear_rgb_to_gray(KernelGlobals *kg, float3 c) CCL_NAMESPACE_END -#endif /* __KERNEL_COLOR_H__ */ +#endif /* __KERNEL_COLOR_H__ */ diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h index aa7a16afa1d..4ee80850402 100644 --- a/intern/cycles/kernel/kernel_compat_cpu.h +++ b/intern/cycles/kernel/kernel_compat_cpu.h @@ -153,4 +153,4 @@ typedef vector3<avxf> avx3f; CCL_NAMESPACE_END -#endif /* __KERNEL_COMPAT_CPU_H__ */ +#endif /* __KERNEL_COMPAT_CPU_H__ */ diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index ac63bcf7ac9..8ed96bbae64 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -150,4 +150,4 @@ ccl_device_inline uint 
ccl_num_groups(uint d) #define logf(x) __logf(((float)(x))) #define expf(x) __expf(((float)(x))) -#endif /* __KERNEL_COMPAT_CUDA_H__ */ +#endif /* __KERNEL_COMPAT_CUDA_H__ */ diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h index 3f7e264fbee..21a95098894 100644 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ b/intern/cycles/kernel/kernel_compat_opencl.h @@ -159,4 +159,4 @@ #include "util/util_half.h" #include "util/util_types.h" -#endif /* __KERNEL_COMPAT_OPENCL_H__ */ +#endif /* __KERNEL_COMPAT_OPENCL_H__ */ diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h index 74cfacb5bc1..37402f42863 100644 --- a/intern/cycles/kernel/kernel_globals.h +++ b/intern/cycles/kernel/kernel_globals.h @@ -21,6 +21,7 @@ #ifdef __KERNEL_CPU__ # include "util/util_vector.h" +# include "util/util_map.h" #endif #ifdef __KERNEL_OPENCL__ @@ -42,6 +43,8 @@ struct OSLThreadData; struct OSLShadingSystem; # endif +typedef unordered_map<float, float> CoverageMap; + struct Intersection; struct VolumeStep; @@ -68,6 +71,11 @@ typedef struct KernelGlobals { VolumeStep *decoupled_volume_steps[2]; int decoupled_volume_steps_index; + /* A buffer for storing per-pixel coverage for Cryptomatte. */ + CoverageMap *coverage_object; + CoverageMap *coverage_material; + CoverageMap *coverage_asset; + /* split kernel */ SplitData split_data; SplitParams split_param_data; diff --git a/intern/cycles/kernel/kernel_id_passes.h b/intern/cycles/kernel/kernel_id_passes.h new file mode 100644 index 00000000000..ee3b8b8abfb --- /dev/null +++ b/intern/cycles/kernel/kernel_id_passes.h @@ -0,0 +1,94 @@ +/* +* Copyright 2018 Blender Foundation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer, int num_slots, float id, float weight) +{ + kernel_assert(id != ID_NONE); + if(weight == 0.0f) { + return; + } + + for(int slot = 0; slot < num_slots; slot++) { + ccl_global float2 *id_buffer = (ccl_global float2*)buffer; +#ifdef __ATOMIC_PASS_WRITE__ + /* If the loop reaches an empty slot, the ID isn't in any slot yet - so add it! */ + if(id_buffer[slot].x == ID_NONE) { + /* Use an atomic to claim this slot. + * If a different thread got here first, try again from this slot on. */ + float old_id = atomic_compare_and_swap_float(buffer+slot*2, ID_NONE, id); + if(old_id != ID_NONE && old_id != id) { + continue; + } + atomic_add_and_fetch_float(buffer+slot*2+1, weight); + break; + } + /* If there already is a slot for that ID, add the weight. + * If no slot was found, add it to the last. */ + else if(id_buffer[slot].x == id || slot == num_slots - 1) { + atomic_add_and_fetch_float(buffer+slot*2+1, weight); + break; + } +#else /* __ATOMIC_PASS_WRITE__ */ + /* If the loop reaches an empty slot, the ID isn't in any slot yet - so add it! */ + if(id_buffer[slot].x == ID_NONE) { + id_buffer[slot].x = id; + id_buffer[slot].y = weight; + break; + } + /* If there already is a slot for that ID, add the weight. + * If no slot was found, add it to the last. 
*/ + else if(id_buffer[slot].x == id || slot == num_slots - 1) { + id_buffer[slot].y += weight; + break; + } +#endif /* __ATOMIC_PASS_WRITE__ */ + } +} + +ccl_device_inline void kernel_sort_id_slots(ccl_global float *buffer, int num_slots) +{ + ccl_global float2 *id_buffer = (ccl_global float2*)buffer; + for(int slot = 1; slot < num_slots; ++slot) { + if(id_buffer[slot].x == ID_NONE) { + return; + } + /* Since we're dealing with a tiny number of elements, insertion sort should be fine. */ + int i = slot; + while(i > 0 && id_buffer[i].y > id_buffer[i - 1].y) { + float2 swap = id_buffer[i]; + id_buffer[i] = id_buffer[i - 1]; + id_buffer[i - 1] = swap; + --i; + } + } +} + +#ifdef __KERNEL_GPU__ +/* post-sorting for Cryptomatte */ +ccl_device void kernel_cryptomatte_post(KernelGlobals *kg, ccl_global float *buffer, uint sample, int x, int y, int offset, int stride) +{ + if(sample - 1 == kernel_data.integrator.aa_samples) { + int index = offset + x + y * stride; + int pass_stride = kernel_data.film.pass_stride; + ccl_global float *cryptomatte_buffer = buffer + index * pass_stride + kernel_data.film.pass_cryptomatte; + kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth); + } +} +#endif + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_math.h b/intern/cycles/kernel/kernel_math.h index 96391db7649..a8a43f3ea4a 100644 --- a/intern/cycles/kernel/kernel_math.h +++ b/intern/cycles/kernel/kernel_math.h @@ -25,4 +25,4 @@ #include "util/util_texture.h" #include "util/util_transform.h" -#endif /* __KERNEL_MATH_H__ */ +#endif /* __KERNEL_MATH_H__ */ diff --git a/intern/cycles/kernel/kernel_montecarlo.h b/intern/cycles/kernel/kernel_montecarlo.h index 9b96bb80c32..dde93844dd3 100644 --- a/intern/cycles/kernel/kernel_montecarlo.h +++ b/intern/cycles/kernel/kernel_montecarlo.h @@ -187,7 +187,10 @@ ccl_device float2 regular_polygon_sample(float corners, float rotation, float u, ccl_device float3 ensure_valid_reflection(float3 Ng, float3 I, 
float3 N) { float3 R = 2*dot(N, I)*N - I; - if(dot(Ng, R) >= 0.05f) { + + /* Reflection rays may always be at least as shallow as the incoming ray. */ + float threshold = min(0.9f*dot(Ng, I), 0.01f); + if(dot(Ng, R) >= threshold) { return N; } @@ -195,24 +198,88 @@ ccl_device float3 ensure_valid_reflection(float3 Ng, float3 I, float3 N) * The X axis is found by normalizing the component of N that's orthogonal to Ng. * The Y axis isn't actually needed. */ - float3 X = normalize(N - dot(N, Ng)*Ng); - - /* Calculate N.z and N.x in the local coordinate system. */ - float Iz = dot(I, Ng); - float Ix2 = sqr(dot(I, X)), Iz2 = sqr(Iz); - float Ix2Iz2 = Ix2 + Iz2; - - float a = safe_sqrtf(Ix2*(Ix2Iz2 - sqr(0.05f))); - float b = Iz*0.05f + Ix2Iz2; - float c = (a + b > 0.0f)? (a + b) : (-a + b); + float NdotNg = dot(N, Ng); + float3 X = normalize(N - NdotNg*Ng); + + /* Calculate N.z and N.x in the local coordinate system. + * + * The goal of this computation is to find a N' that is rotated towards Ng just enough + * to lift R' above the threshold (here called t), therefore dot(R', Ng) = t. + * + * According to the standard reflection equation, this means that we want dot(2*dot(N', I)*N' - I, Ng) = t. + * + * Since the Z axis of our local coordinate system is Ng, dot(x, Ng) is just x.z, so we get 2*dot(N', I)*N'.z - I.z = t. + * + * The rotation is simple to express in the coordinate system we formed - since N lies in the X-Z-plane, we know that + * N' will also lie in the X-Z-plane, so N'.y = 0 and therefore dot(N', I) = N'.x*I.x + N'.z*I.z . + * + * Furthermore, we want N' to be normalized, so N'.x = sqrt(1 - N'.z^2). + * + * With these simplifications, we get the final equation 2*(sqrt(1 - N'.z^2)*I.x + N'.z*I.z)*N'.z - I.z = t. + * + * The only unknown here is N'.z, so we can solve for that. 
+ * + * The equation has four solutions in general: + * + * N'.z = +-sqrt(0.5*(+-sqrt(I.x^2*(I.x^2 + I.z^2 - t^2)) + t*I.z + I.x^2 + I.z^2)/(I.x^2 + I.z^2)) + * We can simplify this expression a bit by grouping terms: + * + * a = I.x^2 + I.z^2 + * b = sqrt(I.x^2 * (a - t^2)) + * c = I.z*t + a + * N'.z = +-sqrt(0.5*(+-b + c)/a) + * + * Two solutions can immediately be discarded because they're negative so N' would lie in the lower hemisphere. + */ + float Ix = dot(I, X), Iz = dot(I, Ng); + float Ix2 = sqr(Ix), Iz2 = sqr(Iz); + float a = Ix2 + Iz2; + + float b = safe_sqrtf(Ix2*(a - sqr(threshold))); + float c = Iz*threshold + a; + + /* Evaluate both solutions. + * In many cases one can be immediately discarded (if N'.z would be imaginary or larger than one), so check for that first. + * If no option is viable (might happen in extreme cases like N being in the wrong hemisphere), give up and return Ng. */ + float fac = 0.5f/a; + float N1_z2 = fac*(b+c), N2_z2 = fac*(-b+c); + bool valid1 = (N1_z2 > 1e-5f) && (N1_z2 <= (1.0f + 1e-5f)); + bool valid2 = (N2_z2 > 1e-5f) && (N2_z2 <= (1.0f + 1e-5f)); + + float2 N_new; + if(valid1 && valid2) { + /* If both are possible, do the expensive reflection-based check. */ + float2 N1 = make_float2(safe_sqrtf(1.0f - N1_z2), safe_sqrtf(N1_z2)); + float2 N2 = make_float2(safe_sqrtf(1.0f - N2_z2), safe_sqrtf(N2_z2)); + + float R1 = 2*(N1.x*Ix + N1.y*Iz)*N1.y - Iz; + float R2 = 2*(N2.x*Ix + N2.y*Iz)*N2.y - Iz; + + valid1 = (R1 >= 1e-5f); + valid2 = (R2 >= 1e-5f); + if(valid1 && valid2) { + /* If both solutions are valid, return the one with the shallower reflection since it will be closer to the input + * (if the original reflection wasn't shallow, we would not be in this part of the function). */ + N_new = (R1 < R2)? N1 : N2; + } + else { + /* If only one reflection is valid (= positive), pick that one. */ + N_new = (R1 > R2)? 
N1 : N2; + } - float Nz = safe_sqrtf(0.5f * c * (1.0f / Ix2Iz2)); - float Nx = safe_sqrtf(1.0f - sqr(Nz)); + } + else if(valid1 || valid2) { + /* Only one solution passes the N'.z criterium, so pick that one. */ + float Nz2 = valid1? N1_z2 : N2_z2; + N_new = make_float2(safe_sqrtf(1.0f - Nz2), safe_sqrtf(Nz2)); + } + else { + return Ng; + } - /* Transform back into global coordinates. */ - return Nx*X + Nz*Ng; + return N_new.x*X + N_new.y*Ng; } CCL_NAMESPACE_END -#endif /* __KERNEL_MONTECARLO_CL__ */ +#endif /* __KERNEL_MONTECARLO_CL__ */ diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h index 458aa6c2a97..80477f921ea 100644 --- a/intern/cycles/kernel/kernel_passes.h +++ b/intern/cycles/kernel/kernel_passes.h @@ -14,12 +14,14 @@ * limitations under the License. */ -CCL_NAMESPACE_BEGIN - #if defined(__SPLIT_KERNEL__) || defined(__KERNEL_CUDA__) #define __ATOMIC_PASS_WRITE__ #endif +#include "kernel/kernel_id_passes.h" + +CCL_NAMESPACE_BEGIN + ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, float value) { ccl_global float *buf = buffer; @@ -108,7 +110,7 @@ ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_glob float value = path_total_shaded / max(path_total, 1e-7f); kernel_write_pass_float(buffer+2, value*value); } -#endif /* __DENOISING_FEATURES__ */ +#endif /* __DENOISING_FEATURES__ */ ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg, ShaderData *sd, @@ -187,7 +189,24 @@ ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg, L->debug_data.num_ray_bounces); } } -#endif /* __KERNEL_DEBUG__ */ +#endif /* __KERNEL_DEBUG__ */ + +#ifdef __KERNEL_CPU__ +#define WRITE_ID_SLOT(buffer, depth, id, matte_weight, name) kernel_write_id_pass_cpu(buffer, depth * 2, id, matte_weight, kg->coverage_##name) +ccl_device_inline size_t kernel_write_id_pass_cpu(float *buffer, size_t depth, float id, float matte_weight, CoverageMap *map) +{ + if(map) { + 
(*map)[id] += matte_weight; + return 0; + } +#else /* __KERNEL_CPU__ */ +#define WRITE_ID_SLOT(buffer, depth, id, matte_weight, name) kernel_write_id_slots_gpu(buffer, depth * 2, id, matte_weight) +ccl_device_inline size_t kernel_write_id_slots_gpu(ccl_global float *buffer, size_t depth, float id, float matte_weight) +{ +#endif /* __KERNEL_CPU__ */ + kernel_write_id_slots(buffer, depth, id, matte_weight); + return depth * 2; +} ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L, ShaderData *sd, ccl_addr_space PathState *state, float3 throughput) @@ -242,6 +261,26 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl } } + if(kernel_data.film.cryptomatte_passes) { + const float matte_weight = average(throughput) * (1.0f - average(shader_bsdf_transparency(kg, sd))); + if(matte_weight > 0.0f) { + ccl_global float *cryptomatte_buffer = buffer + kernel_data.film.pass_cryptomatte; + if(kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) { + float id = object_cryptomatte_id(kg, sd->object); + cryptomatte_buffer += WRITE_ID_SLOT(cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, object); + } + if(kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) { + float id = shader_cryptomatte_id(kg, sd->shader); + cryptomatte_buffer += WRITE_ID_SLOT(cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, material); + } + if(kernel_data.film.cryptomatte_passes & CRYPT_ASSET) { + float id = object_cryptomatte_asset_id(kg, sd->object); + cryptomatte_buffer += WRITE_ID_SLOT(cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, asset); + } + } + } + + if(light_flag & PASSMASK_COMPONENT(DIFFUSE)) L->color_diffuse += shader_bsdf_diffuse(kg, sd)*throughput; if(light_flag & PASSMASK_COMPONENT(GLOSSY)) diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index 5745762e183..cb1f410b09f 100644 --- 
a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -266,7 +266,7 @@ ccl_device_forceinline VolumeIntegrateResult kernel_path_volume( } #endif /* __VOLUME__ */ -#endif /* __SPLIT_KERNEL__ */ +#endif /* __SPLIT_KERNEL__ */ ccl_device_forceinline bool kernel_path_shader_apply( KernelGlobals *kg, @@ -434,7 +434,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, else if(result == VOLUME_PATH_MISSED) { break; } -#endif /* __VOLUME__*/ +#endif /* __VOLUME__*/ /* Shade background. */ if(!hit) { @@ -557,7 +557,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, #endif /* __SUBSURFACE__ */ } -#endif /* defined(__BRANCHED_PATH__) || defined(__BAKING__) */ +#endif /* defined(__BRANCHED_PATH__) || defined(__BAKING__) */ ccl_device_forceinline void kernel_path_integrate( KernelGlobals *kg, @@ -605,7 +605,7 @@ ccl_device_forceinline void kernel_path_integrate( else if(result == VOLUME_PATH_MISSED) { break; } -#endif /* __VOLUME__*/ +#endif /* __VOLUME__*/ /* Shade background. 
*/ if(!hit) { diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h index feaea15d3c4..d2506fc1e7e 100644 --- a/intern/cycles/kernel/kernel_path_volume.h +++ b/intern/cycles/kernel/kernel_path_volume.h @@ -55,7 +55,7 @@ ccl_device_inline void kernel_path_volume_connect_light( } } } -#endif /* __EMISSION__ */ +#endif /* __EMISSION__ */ } #ifdef __KERNEL_GPU__ @@ -277,10 +277,10 @@ ccl_device void kernel_branched_path_volume_connect_light( } } } -#endif /* __EMISSION__ */ +#endif /* __EMISSION__ */ } -#endif /* __SPLIT_KERNEL__ */ +#endif /* __SPLIT_KERNEL__ */ -#endif /* __VOLUME_SCATTER__ */ +#endif /* __VOLUME_SCATTER__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h index e32d4bbbc1b..de8cc4a0cef 100644 --- a/intern/cycles/kernel/kernel_queues.h +++ b/intern/cycles/kernel/kernel_queues.h @@ -145,4 +145,4 @@ ccl_device int dequeue_ray_index( CCL_NAMESPACE_END -#endif // __KERNEL_QUEUE_H__ +#endif // __KERNEL_QUEUE_H__ diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h index b33e4eba8a4..61ddf4a4f81 100644 --- a/intern/cycles/kernel/kernel_random.h +++ b/intern/cycles/kernel/kernel_random.h @@ -50,7 +50,7 @@ ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension) return result; } -#endif /* __SOBOL__ */ +#endif /* __SOBOL__ */ ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h index e834b701f96..af883aa715b 100644 --- a/intern/cycles/kernel/kernel_shader.h +++ b/intern/cycles/kernel/kernel_shader.h @@ -1276,4 +1276,9 @@ ccl_device bool shader_transparent_shadow(KernelGlobals *kg, Intersection *isect } #endif /* __TRANSPARENT_SHADOWS__ */ +ccl_device float shader_cryptomatte_id(KernelGlobals *kg, int shader) +{ + return kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).cryptomatte_id; +} + 
CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h index 8a0da6c3b13..fafa3ad4bfa 100644 --- a/intern/cycles/kernel/kernel_shadow.h +++ b/intern/cycles/kernel/kernel_shadow.h @@ -446,7 +446,7 @@ ccl_device bool shadow_blocked_transparent_stepped( } # endif /* __KERNEL_GPU__ || !__SHADOW_RECORD_ALL__ */ -#endif /* __TRANSPARENT_SHADOWS__ */ +#endif /* __TRANSPARENT_SHADOWS__ */ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ShaderData *sd, diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index e93100a6442..864aa7c470a 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -17,6 +17,12 @@ #ifndef __KERNEL_TYPES_H__ #define __KERNEL_TYPES_H__ +#if !defined(__KERNEL_GPU__) && defined(WITH_EMBREE) +# include <embree3/rtcore.h> +# include <embree3/rtcore_scene.h> +# define __EMBREE__ +#endif + #include "kernel/kernel_math.h" #include "kernel/svm/svm_types.h" #include "util/util_static_assert.h" @@ -53,6 +59,7 @@ CCL_NAMESPACE_BEGIN #define OBJECT_NONE (~0) #define PRIM_NONE (~0) #define LAMP_NONE (~0) +#define ID_NONE (0.0f) #define VOLUME_STACK_SIZE 32 @@ -415,6 +422,7 @@ typedef enum PassType { PASS_RAY_BOUNCES, #endif PASS_RENDER_TIME, + PASS_CRYPTOMATTE, PASS_CATEGORY_MAIN_END = 31, PASS_MIST = 32, @@ -443,6 +451,14 @@ typedef enum PassType { #define PASS_ANY (~0) +typedef enum CryptomatteType { + CRYPT_NONE = 0, + CRYPT_OBJECT = (1 << 0), + CRYPT_MATERIAL = (1 << 1), + CRYPT_ASSET = (1 << 2), + CRYPT_ACCURATE = (1 << 3), +} CryptomatteType; + typedef enum DenoisingPassOffsets { DENOISING_PASS_NORMAL = 0, DENOISING_PASS_NORMAL_VAR = 3, @@ -599,7 +615,7 @@ typedef ccl_addr_space struct PathRadiance { #ifdef __KERNEL_DEBUG__ DebugData debug_data; -#endif /* __KERNEL_DEBUG__ */ +#endif /* __KERNEL_DEBUG__ */ } PathRadiance; typedef struct BsdfEval { @@ -712,6 +728,9 @@ typedef struct Ray { /* Intersection */ typedef struct 
Intersection { +#ifdef __EMBREE__ + float3 Ng; +#endif float t, u, v; int prim; int object; @@ -1260,6 +1279,9 @@ typedef struct KernelFilm { int pass_shadow; float pass_shadow_scale; int filter_table_offset; + int cryptomatte_passes; + int cryptomatte_depth; + int pass_cryptomatte; int pass_mist; float mist_start; @@ -1270,8 +1292,6 @@ typedef struct KernelFilm { int pass_denoising_clean; int denoising_flags; - int pad1, pad2, pad3; - /* XYZ to rendering color space transform. float4 instead of float3 to * ensure consistent padding/alignment across devices. */ float4 xyz_to_r; @@ -1385,20 +1405,29 @@ typedef enum KernelBVHLayout { BVH_LAYOUT_BVH2 = (1 << 0), BVH_LAYOUT_BVH4 = (1 << 1), BVH_LAYOUT_BVH8 = (1 << 2), - + BVH_LAYOUT_EMBREE = (1 << 3), BVH_LAYOUT_DEFAULT = BVH_LAYOUT_BVH8, BVH_LAYOUT_ALL = (unsigned int)(-1), } KernelBVHLayout; typedef struct KernelBVH { - /* root node */ + /* Own BVH */ int root; int have_motion; int have_curves; int have_instancing; int bvh_layout; int use_bvh_steps; + + /* Embree */ +#ifdef __EMBREE__ + RTCScene scene; +# ifndef __KERNEL_64_BIT__ + int pad1; +# endif +#else int pad1, pad2; +#endif } KernelBVH; static_assert_align(KernelBVH, 16); @@ -1460,7 +1489,11 @@ typedef struct KernelObject { uint patch_map_offset; uint attribute_map_offset; uint motion_offset; - uint pad; + uint pad1; + + float cryptomatte_object; + float cryptomatte_asset; + float pad2, pad3; } KernelObject; static_assert_align(KernelObject, 16); @@ -1540,7 +1573,7 @@ static_assert_align(KernelParticle, 16); typedef struct KernelShader { float constant_emission[3]; - float pad1; + float cryptomatte_id; int flags; int pass_id; int pad2, pad3; @@ -1672,4 +1705,4 @@ typedef struct WorkTile { CCL_NAMESPACE_END -#endif /* __KERNEL_TYPES_H__ */ +#endif /* __KERNEL_TYPES_H__ */ diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h index d71761a97bc..d6d283c42c5 100644 --- a/intern/cycles/kernel/kernel_volume.h +++ 
b/intern/cycles/kernel/kernel_volume.h @@ -87,7 +87,7 @@ ccl_device_inline bool volume_shader_sample(KernelGlobals *kg, return true; } -#endif /* __VOLUME__ */ +#endif /* __VOLUME__ */ ccl_device float3 volume_color_transmittance(float3 sigma, float t) { @@ -270,7 +270,7 @@ ccl_device_noinline void kernel_volume_shadow(KernelGlobals *kg, kernel_volume_shadow_homogeneous(kg, state, ray, shadow_sd, throughput); } -#endif /* __VOLUME__ */ +#endif /* __VOLUME__ */ /* Equi-angular sampling as in: * "Importance Sampling Techniques for Path Tracing in Participating Media" */ @@ -1075,7 +1075,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter( return VOLUME_PATH_SCATTERED; } -#endif /* __SPLIT_KERNEL */ +#endif /* __SPLIT_KERNEL */ /* decide if we need to use decoupled or not */ ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneous, bool direct, int sampling_method) @@ -1377,6 +1377,6 @@ ccl_device_inline void kernel_volume_clean_stack(KernelGlobals *kg, } } -#endif /* __VOLUME__ */ +#endif /* __VOLUME__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h index b62aa9663ec..e036b53b810 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h @@ -95,6 +95,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, int dy, float *difference_image, float *image, + float *temp_image, float *out_image, float *accum_image, int* rect, diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h index 26777fdabb2..4c758711481 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h @@ -191,6 +191,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, int dy, float *difference_image, float *image, + float *temp_image, float *out_image, float *accum_image, int *rect, 
@@ -200,7 +201,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output); #else - kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), stride, f); + kernel_filter_nlm_update_output(dx, dy, difference_image, image, temp_image, out_image, accum_image, load_int4(rect), stride, f); #endif } diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h index b77b7350d86..ae4fd85780d 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h @@ -26,7 +26,7 @@ template<typename T> struct TextureInterpolator { u[1] = (( 0.5f * t - 1.0f) * t ) * t + (2.0f/3.0f); \ u[2] = (( -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \ u[3] = (1.0f / 6.0f) * t * t * t; \ - } (void)0 + } (void) 0 static ccl_always_inline float4 read(float4 r) { @@ -540,4 +540,4 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, CCL_NAMESPACE_END -#endif // __KERNEL_CPU_IMAGE_H__ +#endif // __KERNEL_CPU_IMAGE_H__ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h index 5ec1655ab05..759b7e4c20d 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h @@ -97,7 +97,7 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg, { kernel_path_trace(kg, buffer, sample, x, y, offset, stride); } -#endif /* KERNEL_STUB */ +#endif /* KERNEL_STUB */ } /* Film */ @@ -120,7 +120,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg, x, y, offset, stride); -#endif /* KERNEL_STUB */ +#endif /* KERNEL_STUB */ } void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, @@ -141,7 +141,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, x, y, offset, 
stride); -#endif /* KERNEL_STUB */ +#endif /* KERNEL_STUB */ } /* Shader Evaluate */ @@ -176,7 +176,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, else { kernel_background_evaluate(kg, input, output, i); } -#endif /* KERNEL_STUB */ +#endif /* KERNEL_STUB */ } #else /* __SPLIT_KERNEL__ */ @@ -208,7 +208,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, ccl_local type locals; \ kernel_##name(kg, &locals); \ } -#endif /* KERNEL_STUB */ +#endif /* KERNEL_STUB */ DEFINE_SPLIT_KERNEL_FUNCTION(path_init) DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect) diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu index 0561c40e6b1..b856cbde45c 100644 --- a/intern/cycles/kernel/kernels/cuda/filter.cu +++ b/intern/cycles/kernel/kernels/cuda/filter.cu @@ -140,7 +140,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image, int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int channel_offset, float a, @@ -148,7 +148,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image, { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w, weight_image, variance_image, @@ -165,13 +165,13 @@ kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image, int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int f) { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_blur(co.x, co.y, difference_image + ofs, out_image + ofs, @@ -186,13 +186,13 @@ kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image, int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int f) { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, 
shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_calc_weight(co.x, co.y, difference_image + ofs, out_image + ofs, @@ -209,13 +209,13 @@ kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image, int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int f) { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w, difference_image + ofs, image, @@ -252,14 +252,13 @@ kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_im int w, int h, int stride, - int shift_stride, + int pass_stride, int r, - int f, - int pass_stride) + int f) { int4 co, rect; int ofs; - if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) { + if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) { kernel_filter_nlm_construct_gramian(co.x, co.y, co.z, co.w, difference_image + ofs, diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu index 8a180a509e8..af311027f78 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel.cu @@ -40,14 +40,21 @@ CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_path_trace(WorkTile *tile, uint total_work_size) { int work_index = ccl_global_id(0); - - if(work_index < total_work_size) { - uint x, y, sample; + bool thread_is_active = work_index < total_work_size; + uint x, y, sample; + KernelGlobals kg; + if(thread_is_active) { get_work_pixel(tile, work_index, &x, &y, &sample); - KernelGlobals kg; kernel_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride); } + + if(kernel_data.film.cryptomatte_passes) { + __syncthreads(); + if(thread_is_active) { + kernel_cryptomatte_post(&kg, tile->buffer, 
sample, x, y, tile->offset, tile->stride); + } + } } #ifdef __BRANCHED_PATH__ @@ -56,14 +63,21 @@ CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS) kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size) { int work_index = ccl_global_id(0); - - if(work_index < total_work_size) { - uint x, y, sample; + bool thread_is_active = work_index < total_work_size; + uint x, y, sample; + KernelGlobals kg; + if(thread_is_active) { get_work_pixel(tile, work_index, &x, &y, &sample); - KernelGlobals kg; kernel_branched_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride); } + + if(kernel_data.film.cryptomatte_passes) { + __syncthreads(); + if(thread_is_active) { + kernel_cryptomatte_post(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride); + } + } } #endif diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl index 3c75754fb39..a550f97f4eb 100644 --- a/intern/cycles/kernel/kernels/opencl/filter.cl +++ b/intern/cycles/kernel/kernels/opencl/filter.cl @@ -132,7 +132,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_ int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int channel_offset, float a, @@ -140,7 +140,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_ { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w, weight_image, variance_image, @@ -155,13 +155,13 @@ __kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict di int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int f) { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_blur(co.x, co.y, 
difference_image + ofs, out_image + ofs, @@ -174,13 +174,13 @@ __kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_rest int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int f) { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_calc_weight(co.x, co.y, difference_image + ofs, out_image + ofs, @@ -195,13 +195,13 @@ __kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_re int w, int h, int stride, - int shift_stride, + int pass_stride, int r, int f) { int4 co, rect; int ofs; - if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w, difference_image + ofs, image, @@ -234,14 +234,13 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *cc int w, int h, int stride, - int shift_stride, + int pass_stride, int r, - int f, - int pass_stride) + int f) { int4 co, rect; int ofs; - if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) { + if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) { kernel_filter_nlm_construct_gramian(co.x, co.y, co.z, co.w, difference_image + ofs, diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl index 63128d0aecf..de1f5088629 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel.cl @@ -66,9 +66,17 @@ __kernel void kernel_ocl_path_trace( int x = sx + ccl_global_id(0); int y = sy + ccl_global_id(1); - - if(x < sx + sw && y < sy + sh) + bool thread_is_active = x < sx + sw && y < sy + sh; + if(thread_is_active) { kernel_path_trace(kg, buffer, sample, x, y, offset, stride); + } + if(kernel_data.film.cryptomatte_passes) { + /* Make sure no thread is writing 
to the buffers. */ + ccl_barrier(CCL_LOCAL_MEM_FENCE); + if(thread_is_active) { + kernel_cryptomatte_post(kg, buffer, sample, x, y, offset, stride); + } + } } #else /* __COMPILE_ONLY_MEGAKERNEL__ */ diff --git a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h index dd9d683e030..79af831c2fb 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h +++ b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h @@ -142,7 +142,7 @@ ccl_device_inline float svm_image_texture_frac(float x, int *ix) u[1] = (( 0.5f * t - 1.0f) * t ) * t + (2.0f/3.0f); \ u[2] = (( -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \ u[3] = (1.0f / 6.0f) * t * t * t; \ - } (void)0 + } (void) 0 ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y) { diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h index d9aeb9ab9fb..2a50704b569 100644 --- a/intern/cycles/kernel/osl/osl_closures.h +++ b/intern/cycles/kernel/osl/osl_closures.h @@ -146,4 +146,4 @@ CCLOSURE_PREPARE_STATIC(bsdf_##lower##_prepare, Upper##Closure) CCL_NAMESPACE_END -#endif /* __OSL_CLOSURES_H__ */ +#endif /* __OSL_CLOSURES_H__ */ diff --git a/intern/cycles/kernel/osl/osl_globals.h b/intern/cycles/kernel/osl/osl_globals.h index 30b29793e2d..88192fbcccb 100644 --- a/intern/cycles/kernel/osl/osl_globals.h +++ b/intern/cycles/kernel/osl/osl_globals.h @@ -94,4 +94,4 @@ CCL_NAMESPACE_END #endif -#endif /* __OSL_GLOBALS_H__ */ +#endif /* __OSL_GLOBALS_H__ */ diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp index 7902381440b..97f97a4887e 100644 --- a/intern/cycles/kernel/osl/osl_services.cpp +++ b/intern/cycles/kernel/osl/osl_services.cpp @@ -884,6 +884,23 @@ bool OSLRenderServices::has_userdata(ustring name, TypeDesc type, OSL::ShaderGlo return false; /* never called by OSL */ } +TextureSystem::TextureHandle 
*OSLRenderServices::get_texture_handle(ustring filename) +{ + if(filename.length() && filename[0] == '@') { + /* Dummy, we don't use texture handles for builtin textures but need + * to tell the OSL runtime optimizer that this is a valid texture. */ + return NULL; + } + else { + return texturesys()->get_texture_handle(filename); + } +} + +bool OSLRenderServices::good(TextureSystem::TextureHandle *texture_handle) +{ + return texturesys()->good(texture_handle); +} + bool OSLRenderServices::texture(ustring filename, TextureHandle *texture_handle, TexturePerthread *texture_thread_info, @@ -894,7 +911,8 @@ bool OSLRenderServices::texture(ustring filename, int nchannels, float *result, float *dresultds, - float *dresultdt) + float *dresultdt, + ustring *errormessage) { OSL::TextureSystem *ts = osl_ts; ShaderData *sd = (ShaderData *)(sg->renderstate); @@ -1035,7 +1053,7 @@ bool OSLRenderServices::texture(ustring filename, * other nasty stuff happening. */ string err = ts->geterror(); - (void)err; + (void) err; } return status; @@ -1114,7 +1132,7 @@ bool OSLRenderServices::texture3d(ustring filename, * other nasty stuff happening. */ string err = ts->geterror(); - (void)err; + (void) err; } return status; @@ -1156,7 +1174,13 @@ bool OSLRenderServices::get_texture_info(OSL::ShaderGlobals *sg, ustring filenam TypeDesc datatype, void *data) { OSL::TextureSystem *ts = osl_ts; - return ts->get_texture_info(filename, subimage, dataname, datatype, data); + if(filename.length() && filename[0] == '@') { + /* Special builtin textures. 
*/ + return false; + } + else { + return ts->get_texture_info(filename, subimage, dataname, datatype, data); + } } int OSLRenderServices::pointcloud_search(OSL::ShaderGlobals *sg, ustring filename, const OSL::Vec3 &center, diff --git a/intern/cycles/kernel/osl/osl_services.h b/intern/cycles/kernel/osl/osl_services.h index 50044746fd1..712b06b41b8 100644 --- a/intern/cycles/kernel/osl/osl_services.h +++ b/intern/cycles/kernel/osl/osl_services.h @@ -93,6 +93,10 @@ public: bool getmessage(OSL::ShaderGlobals *sg, ustring source, ustring name, TypeDesc type, void *val, bool derivatives); + TextureSystem::TextureHandle *get_texture_handle(ustring filename); + + bool good(TextureSystem::TextureHandle *texture_handle); + bool texture(ustring filename, TextureSystem::TextureHandle *texture_handle, TexturePerthread *texture_thread_info, @@ -103,7 +107,8 @@ public: int nchannels, float *result, float *dresultds, - float *dresultdt); + float *dresultdt, + ustring *errormessage); bool texture3d(ustring filename, TextureHandle *texture_handle, @@ -194,4 +199,4 @@ private: CCL_NAMESPACE_END -#endif /* __OSL_SERVICES_H__ */ +#endif /* __OSL_SERVICES_H__ */ diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp index 6a690e880ad..a89bb3fd1a3 100644 --- a/intern/cycles/kernel/osl/osl_shader.cpp +++ b/intern/cycles/kernel/osl/osl_shader.cpp @@ -193,7 +193,7 @@ void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state float data[9]; bool found = kg->osl->services->get_attribute(sd, true, OSLRenderServices::u_empty, TypeDesc::TypeVector, OSLRenderServices::u_geom_undisplaced, data); - (void)found; + (void) found; assert(found); memcpy(&sd->P, data, sizeof(float)*3); diff --git a/intern/cycles/kernel/osl/osl_shader.h b/intern/cycles/kernel/osl/osl_shader.h index 571a3f502be..9824f966a44 100644 --- a/intern/cycles/kernel/osl/osl_shader.h +++ b/intern/cycles/kernel/osl/osl_shader.h @@ -66,4 +66,4 @@ CCL_NAMESPACE_END #endif
-#endif /* __OSL_SHADER_H__ */ +#endif /* __OSL_SHADER_H__ */ diff --git a/intern/cycles/kernel/shaders/oslutil.h b/intern/cycles/kernel/shaders/oslutil.h index 141e5d27e3a..592a8ad12d9 100644 --- a/intern/cycles/kernel/shaders/oslutil.h +++ b/intern/cycles/kernel/shaders/oslutil.h @@ -92,4 +92,4 @@ float wireframe(string edge_type, float line_width) { return wireframe(edge_type float wireframe(string edge_type) { return wireframe(edge_type, 1.0, 1); } float wireframe() { return wireframe("polygons", 1.0, 1); } -#endif /* CCL_OSLUTIL_H */ +#endif /* CCL_OSLUTIL_H */ diff --git a/intern/cycles/kernel/shaders/stdosl.h b/intern/cycles/kernel/shaders/stdosl.h index 4a8378796ba..7136c746321 100644 --- a/intern/cycles/kernel/shaders/stdosl.h +++ b/intern/cycles/kernel/shaders/stdosl.h @@ -284,33 +284,63 @@ point rotate (point p, float angle, point a, point b) normal ensure_valid_reflection(normal Ng, vector I, normal N) { + /* The implementation here mirrors the one in kernel_montecarlo.h, + * check there for an explanation of the algorithm. */ + float sqr(float x) { return x*x; } vector R = 2*dot(N, I)*N - I; - if (dot(Ng, R) >= 0.05) { + + float threshold = min(0.9*dot(Ng, I), 0.01); + if(dot(Ng, R) >= threshold) { return N; } - /* Form coordinate system with Ng as the Z axis and N inside the X-Z-plane. - * The X axis is found by normalizing the component of N that's orthogonal to Ng. - * The Y axis isn't actually needed. - */ - vector X = normalize(N - dot(N, Ng)*Ng); + float NdotNg = dot(N, Ng); + vector X = normalize(N - NdotNg*Ng); - /* Calculate N.z and N.x in the local coordinate system. */ float Ix = dot(I, X), Iz = dot(I, Ng); - float Ix2 = sqr(dot(I, X)), Iz2 = sqr(dot(I, Ng)); - float Ix2Iz2 = Ix2 + Iz2; - - float a = sqrt(Ix2*(Ix2Iz2 - sqr(0.05))); - float b = Iz*0.05 + Ix2Iz2; - float c = (a + b > 0.0)? 
(a + b) : (-a + b); + float Ix2 = sqr(Ix), Iz2 = sqr(Iz); + float a = Ix2 + Iz2; + + float b = sqrt(Ix2*(a - sqr(threshold))); + float c = Iz*threshold + a; + + float fac = 0.5/a; + float N1_z2 = fac*(b+c), N2_z2 = fac*(-b+c); + int valid1 = (N1_z2 > 1e-5) && (N1_z2 <= (1.0 + 1e-5)); + int valid2 = (N2_z2 > 1e-5) && (N2_z2 <= (1.0 + 1e-5)); + + float N_new_x, N_new_z; + if(valid1 && valid2) { + float N1_x = sqrt(1.0 - N1_z2), N1_z = sqrt(N1_z2); + float N2_x = sqrt(1.0 - N2_z2), N2_z = sqrt(N2_z2); + + float R1 = 2*(N1_x*Ix + N1_z*Iz)*N1_z - Iz; + float R2 = 2*(N2_x*Ix + N2_z*Iz)*N2_z - Iz; + + valid1 = (R1 >= 1e-5); + valid2 = (R2 >= 1e-5); + if(valid1 && valid2) { + N_new_x = (R1 < R2)? N1_x : N2_x; + N_new_z = (R1 < R2)? N1_z : N2_z; + } + else { + N_new_x = (R1 > R2)? N1_x : N2_x; + N_new_z = (R1 > R2)? N1_z : N2_z; + } - float Nz = sqrt(0.5 * c * (1.0 / Ix2Iz2)); - float Nx = sqrt(1.0 - sqr(Nz)); + } + else if(valid1 || valid2) { + float Nz2 = valid1? N1_z2 : N2_z2; + N_new_x = sqrt(1.0 - Nz2); + N_new_z = sqrt(Nz2); + } + else { + return Ng; + } - /* Transform back into global coordinates. 
*/ - return Nx*X + Nz*Ng; + return N_new_x*X + N_new_z*Ng; } @@ -485,7 +515,7 @@ float smooth_linearstep (float edge0, float edge1, float x_, float eps_) { else if (x >= eps && x <= 1.0-eps) result = x; else if (x >= 1.0+eps) result = 1; else if (x < eps) result = rampup (x+eps, 2.0*eps); - else /* if (x < 1.0+eps) */ result = 1.0 - rampup (1.0+eps - x, 2.0*eps); + else /* if (x < 1.0+eps) */ result = 1.0 - rampup (1.0+eps - x, 2.0*eps); } else { result = step (edge0, x_); } @@ -656,4 +686,4 @@ int getmatrix (string fromspace, output matrix M) { #undef PERCOMP2 #undef PERCOMP2F -#endif /* CCL_STDOSL_H */ +#endif /* CCL_STDOSL_H */ diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h index 180c0b57077..18eec6372f1 100644 --- a/intern/cycles/kernel/split/kernel_buffer_update.h +++ b/intern/cycles/kernel/split/kernel_buffer_update.h @@ -80,8 +80,10 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg, PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + bool ray_was_updated = false; if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { + ray_was_updated = true; uint sample = state->sample; uint buffer_offset = kernel_split_state.buffer_offset[ray_index]; ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; @@ -92,6 +94,17 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg, ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); } + if(kernel_data.film.cryptomatte_passes) { + /* Make sure no thread is writing to the buffers. 
*/ + ccl_barrier(CCL_LOCAL_MEM_FENCE); + if(ray_was_updated && state->sample - 1 == kernel_data.integrator.aa_samples) { + uint buffer_offset = kernel_split_state.buffer_offset[ray_index]; + ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; + ccl_global float *cryptomatte_buffer = buffer + kernel_data.film.pass_cryptomatte; + kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth); + } + } + if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { /* We have completed current work; So get next work */ ccl_global uint *work_pools = kernel_split_params.work_pools; diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h index 2132c42220f..666355de334 100644 --- a/intern/cycles/kernel/split/kernel_shader_sort.h +++ b/intern/cycles/kernel/split/kernel_shader_sort.h @@ -78,7 +78,7 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg, } } } -# endif /* __KERNEL_OPENCL__ */ +# endif /* __KERNEL_OPENCL__ */ /* copy to destination */ for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { @@ -91,7 +91,7 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg, kernel_split_state.queue_data[outi] = (value == (~0)) ? QUEUE_EMPTY_SLOT : kernel_split_state.queue_data[ini]; } } -#endif /* __KERNEL_CUDA__ */ +#endif /* __KERNEL_CUDA__ */ } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h index 9297e1e0ad5..3f6b3977d79 100644 --- a/intern/cycles/kernel/split/kernel_split_data.h +++ b/intern/cycles/kernel/split/kernel_split_data.h @@ -24,7 +24,7 @@ CCL_NAMESPACE_BEGIN ccl_device_inline uint64_t split_data_buffer_size(KernelGlobals *kg, size_t num_elements) { - (void)kg; /* Unused on CPU. */ + (void) kg; /* Unused on CPU. 
*/ uint64_t size = 0; #define SPLIT_DATA_ENTRY(type, name, num) + align_up(num_elements * num * sizeof(type), 16) @@ -48,7 +48,7 @@ ccl_device_inline void split_data_init(KernelGlobals *kg, ccl_global void *data, ccl_global char *ray_state) { - (void)kg; /* Unused on CPU. */ + (void) kg; /* Unused on CPU. */ ccl_global char *p = (ccl_global char*)data; diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h index 56194d9f857..83df1e2a0a6 100644 --- a/intern/cycles/kernel/split/kernel_split_data_types.h +++ b/intern/cycles/kernel/split/kernel_split_data_types.h @@ -86,14 +86,14 @@ typedef ccl_global struct SplitBranchedState { SPLIT_DATA_ENTRY(ccl_global SubsurfaceIndirectRays, ss_rays, 1) #else # define SPLIT_DATA_SUBSURFACE_ENTRIES -#endif /* __SUBSURFACE__ */ +#endif /* __SUBSURFACE__ */ #ifdef __VOLUME__ # define SPLIT_DATA_VOLUME_ENTRIES \ SPLIT_DATA_ENTRY(ccl_global PathState, state_shadow, 1) #else # define SPLIT_DATA_VOLUME_ENTRIES -#endif /* __VOLUME__ */ +#endif /* __VOLUME__ */ #define SPLIT_DATA_ENTRIES \ SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \ diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h index ab69afa051e..ccb9aef7a5b 100644 --- a/intern/cycles/kernel/svm/svm.h +++ b/intern/cycles/kernel/svm/svm.h @@ -313,7 +313,7 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ccl_a case NODE_LEAVE_BUMP_EVAL: svm_node_leave_bump_eval(kg, sd, stack, node.y); break; -# endif /* NODES_FEATURE(NODE_FEATURE_BUMP_STATE) */ +# endif /* NODES_FEATURE(NODE_FEATURE_BUMP_STATE) */ # endif /* NODES_FEATURE(NODE_FEATURE_BUMP) */ case NODE_HSV: svm_node_hsv(kg, sd, stack, node, &offset); @@ -497,4 +497,4 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ccl_a CCL_NAMESPACE_END -#endif /* __SVM_H__ */ +#endif /* __SVM_H__ */ diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h 
index 64bf8244999..3cf33f4d431 100644 --- a/intern/cycles/kernel/svm/svm_closure.h +++ b/intern/cycles/kernel/svm/svm_closure.h @@ -262,7 +262,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * ? (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)) : NULL; - if (bsdf && extra) { + if(bsdf && extra) { bsdf->N = N; bsdf->ior = (2.0f / (1.0f - safe_sqrtf(0.08f * specular))) - 1.0f; bsdf->T = T; @@ -285,7 +285,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * /* setup bsdf */ if(distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID || roughness <= 0.075f) /* use single-scatter GGX */ sd->flag |= bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf, sd); - else /* use multi-scatter GGX */ + else /* use multi-scatter GGX */ sd->flag |= bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf, sd); } } @@ -314,7 +314,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * ? (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)) : NULL; - if (bsdf && extra) { + if(bsdf && extra) { bsdf->N = N; bsdf->T = make_float3(0.0f, 0.0f, 0.0f); bsdf->extra = extra; diff --git a/intern/cycles/kernel/svm/svm_hsv.h b/intern/cycles/kernel/svm/svm_hsv.h index 27127b85323..41538d1138d 100644 --- a/intern/cycles/kernel/svm/svm_hsv.h +++ b/intern/cycles/kernel/svm/svm_hsv.h @@ -59,4 +59,4 @@ ccl_device void svm_node_hsv(KernelGlobals *kg, ShaderData *sd, float *stack, ui CCL_NAMESPACE_END -#endif /* __SVM_HSV_H__ */ +#endif /* __SVM_HSV_H__ */ diff --git a/intern/cycles/kernel/svm/svm_ramp.h b/intern/cycles/kernel/svm/svm_ramp.h index a3e4b6e87cd..6f39391057e 100644 --- a/intern/cycles/kernel/svm/svm_ramp.h +++ b/intern/cycles/kernel/svm/svm_ramp.h @@ -108,4 +108,4 @@ ccl_device void svm_node_curves(KernelGlobals *kg, ShaderData *sd, float *stack, CCL_NAMESPACE_END -#endif /* __SVM_RAMP_H__ */ +#endif /* __SVM_RAMP_H__ */ diff --git a/intern/cycles/kernel/svm/svm_ramp_util.h 
b/intern/cycles/kernel/svm/svm_ramp_util.h index a67689ff9d1..847108ff1c2 100644 --- a/intern/cycles/kernel/svm/svm_ramp_util.h +++ b/intern/cycles/kernel/svm/svm_ramp_util.h @@ -95,4 +95,4 @@ ccl_device float float_ramp_lookup(const float *ramp, CCL_NAMESPACE_END -#endif /* __SVM_RAMP_UTIL_H__ */ +#endif /* __SVM_RAMP_UTIL_H__ */ diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h index 910537a2539..0f1dfa4936b 100644 --- a/intern/cycles/kernel/svm/svm_types.h +++ b/intern/cycles/kernel/svm/svm_types.h @@ -531,4 +531,4 @@ typedef enum ClosureType { CCL_NAMESPACE_END -#endif /* __SVM_TYPES_H__ */ +#endif /* __SVM_TYPES_H__ */ diff --git a/intern/cycles/kernel/svm/svm_wave.h b/intern/cycles/kernel/svm/svm_wave.h index 7b60ab6e6ae..80b63dc80cd 100644 --- a/intern/cycles/kernel/svm/svm_wave.h +++ b/intern/cycles/kernel/svm/svm_wave.h @@ -24,7 +24,7 @@ ccl_device_noinline float svm_wave(NodeWaveType type, NodeWaveProfile profile, f if(type == NODE_WAVE_BANDS) n = (p.x + p.y + p.z) * 10.0f; - else /* NODE_WAVE_RINGS */ + else /* NODE_WAVE_RINGS */ n = len(p) * 20.0f; if(distortion != 0.0f) diff --git a/intern/cycles/render/CMakeLists.txt b/intern/cycles/render/CMakeLists.txt index 7d2220f37f9..c0ce7368771 100644 --- a/intern/cycles/render/CMakeLists.txt +++ b/intern/cycles/render/CMakeLists.txt @@ -15,6 +15,7 @@ set(SRC buffers.cpp camera.cpp constant_fold.cpp + coverage.cpp film.cpp graph.cpp image.cpp @@ -46,6 +47,7 @@ set(SRC_HEADERS buffers.h camera.h constant_fold.h + coverage.h film.h graph.h image.h diff --git a/intern/cycles/render/attribute.h b/intern/cycles/render/attribute.h index 40e5be2e1b2..e7438f4513d 100644 --- a/intern/cycles/render/attribute.h +++ b/intern/cycles/render/attribute.h @@ -172,4 +172,4 @@ public: CCL_NAMESPACE_END -#endif /* __ATTRIBUTE_H__ */ +#endif /* __ATTRIBUTE_H__ */ diff --git a/intern/cycles/render/background.h b/intern/cycles/render/background.h index 3f56dedb2c8..17c3eaaaaf5 100644 --- 
a/intern/cycles/render/background.h +++ b/intern/cycles/render/background.h @@ -59,4 +59,4 @@ public: CCL_NAMESPACE_END -#endif /* __BACKGROUND_H__ */ +#endif /* __BACKGROUND_H__ */ diff --git a/intern/cycles/render/bake.h b/intern/cycles/render/bake.h index a811eac3327..fce8f2fa606 100644 --- a/intern/cycles/render/bake.h +++ b/intern/cycles/render/bake.h @@ -83,4 +83,4 @@ private: CCL_NAMESPACE_END -#endif /* __BAKE_H__ */ +#endif /* __BAKE_H__ */ diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp index 4cd8b3726d3..f901885e679 100644 --- a/intern/cycles/render/buffers.cpp +++ b/intern/cycles/render/buffers.cpp @@ -147,7 +147,7 @@ bool RenderBuffers::copy_from_device() return true; } -bool RenderBuffers::get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels) +bool RenderBuffers::get_denoising_pass_rect(int type, float exposure, int sample, int components, float *pixels) { if(buffer.data() == NULL) { return false; @@ -155,19 +155,20 @@ bool RenderBuffers::get_denoising_pass_rect(int offset, float exposure, int samp float invsample = 1.0f/sample; float scale = invsample; - bool variance = (offset == DENOISING_PASS_NORMAL_VAR) || - (offset == DENOISING_PASS_ALBEDO_VAR) || - (offset == DENOISING_PASS_DEPTH_VAR) || - (offset == DENOISING_PASS_COLOR_VAR); - - if(offset == DENOISING_PASS_COLOR || offset == DENOISING_PASS_CLEAN) { - scale *= exposure; + bool variance = (type == DENOISING_PASS_NORMAL_VAR) || + (type == DENOISING_PASS_ALBEDO_VAR) || + (type == DENOISING_PASS_DEPTH_VAR) || + (type == DENOISING_PASS_COLOR_VAR); + + float scale_exposure = scale; + if(type == DENOISING_PASS_COLOR || type == DENOISING_PASS_CLEAN) { + scale_exposure *= exposure; } - else if(offset == DENOISING_PASS_COLOR_VAR) { - scale *= exposure*exposure; + else if(type == DENOISING_PASS_COLOR_VAR) { + scale_exposure *= exposure*exposure; } - offset += params.get_denoising_offset(); + int offset = type + 
params.get_denoising_offset(); int pass_stride = params.get_passes_size(); int size = params.width*params.height; @@ -181,14 +182,14 @@ bool RenderBuffers::get_denoising_pass_rect(int offset, float exposure, int samp if(components == 1) { for(int i = 0; i < size; i++, mean += pass_stride, var += pass_stride, pixels++) { - pixels[0] = max(0.0f, var[0] - mean[0]*mean[0]*invsample)*scale; + pixels[0] = max(0.0f, var[0] - mean[0]*mean[0]*invsample)*scale_exposure; } } else if(components == 3) { for(int i = 0; i < size; i++, mean += pass_stride, var += pass_stride, pixels += 3) { - pixels[0] = max(0.0f, var[0] - mean[0]*mean[0]*invsample)*scale; - pixels[1] = max(0.0f, var[1] - mean[1]*mean[1]*invsample)*scale; - pixels[2] = max(0.0f, var[2] - mean[2]*mean[2]*invsample)*scale; + pixels[0] = max(0.0f, var[0] - mean[0]*mean[0]*invsample)*scale_exposure; + pixels[1] = max(0.0f, var[1] - mean[1]*mean[1]*invsample)*scale_exposure; + pixels[2] = max(0.0f, var[2] - mean[2]*mean[2]*invsample)*scale_exposure; } } else { @@ -200,14 +201,28 @@ bool RenderBuffers::get_denoising_pass_rect(int offset, float exposure, int samp if(components == 1) { for(int i = 0; i < size; i++, in += pass_stride, pixels++) { - pixels[0] = in[0]*scale; + pixels[0] = in[0]*scale_exposure; } } else if(components == 3) { for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) { - pixels[0] = in[0]*scale; - pixels[1] = in[1]*scale; - pixels[2] = in[2]*scale; + pixels[0] = in[0]*scale_exposure; + pixels[1] = in[1]*scale_exposure; + pixels[2] = in[2]*scale_exposure; + } + } + else if(components == 4) { + assert(type == DENOISING_PASS_COLOR); + + /* Since the alpha channel is not involved in denoising, output the Combined alpha channel. 
*/ + assert(params.passes[0].type == PASS_COMBINED); + float *in_combined = buffer.data(); + + for(int i = 0; i < size; i++, in += pass_stride, in_combined += pass_stride, pixels += 4) { + pixels[0] = in[0]*scale_exposure; + pixels[1] = in[1]*scale_exposure; + pixels[2] = in[2]*scale_exposure; + pixels[3] = saturate(in_combined[3]*scale); } } else { @@ -218,7 +233,7 @@ bool RenderBuffers::get_denoising_pass_rect(int offset, float exposure, int samp return true; } -bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int components, float *pixels) +bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int components, float *pixels, const string &name) { if(buffer.data() == NULL) { return false; @@ -234,6 +249,14 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int continue; } + /* Tell Cryptomatte passes apart by their name. */ + if(pass.type == PASS_CRYPTOMATTE) { + if(pass.name != name) { + pass_offset += pass.components; + continue; + } + } + float *in = buffer.data() + pass_offset; int pass_stride = params.get_passes_size(); @@ -370,6 +393,17 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int pixels[3] = f.w*invw; } } + else if(type == PASS_CRYPTOMATTE) { + for(int i = 0; i < size; i++, in += pass_stride, pixels += 4) { + float4 f = make_float4(in[0], in[1], in[2], in[3]); + /* x and z contain integer IDs, don't rescale them. + y and w contain matte weights, they get scaled. 
*/ + pixels[0] = f.x; + pixels[1] = f.y * scale; + pixels[2] = f.z; + pixels[3] = f.w * scale; + } + } else { for(int i = 0; i < size; i++, in += pass_stride, pixels += 4) { float4 f = make_float4(in[0], in[1], in[2], in[3]); diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h index 1b06ffe33a6..46c3b89bd84 100644 --- a/intern/cycles/render/buffers.h +++ b/intern/cycles/render/buffers.h @@ -50,7 +50,7 @@ public: int full_height; /* passes */ - array<Pass> passes; + vector<Pass> passes; bool denoising_data_pass; /* If only some light path types should be denoised, an additional pass is needed. */ bool denoising_clean_pass; @@ -84,7 +84,7 @@ public: void zero(); bool copy_from_device(); - bool get_pass_rect(PassType type, float exposure, int sample, int components, float *pixels); + bool get_pass_rect(PassType type, float exposure, int sample, int components, float *pixels, const string &name); bool get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels); }; @@ -146,4 +146,4 @@ public: CCL_NAMESPACE_END -#endif /* __BUFFERS_H__ */ +#endif /* __BUFFERS_H__ */ diff --git a/intern/cycles/render/camera.cpp b/intern/cycles/render/camera.cpp index ec3c56e820a..34066e1b024 100644 --- a/intern/cycles/render/camera.cpp +++ b/intern/cycles/render/camera.cpp @@ -716,7 +716,7 @@ float Camera::world_to_raster_size(float3 P) float3 D = transform_point(&worldtocamera, P); float dist = len(D); - Ray ray = {0}; + Ray ray = {{0}}; /* Distortion can become so great that the results become meaningless, there * may be a better way to do this, but calculating differentials from the diff --git a/intern/cycles/render/camera.h b/intern/cycles/render/camera.h index 323f2c61ca4..37f5dea624f 100644 --- a/intern/cycles/render/camera.h +++ b/intern/cycles/render/camera.h @@ -21,6 +21,7 @@ #include "graph/node.h" +#include "util/util_array.h" #include "util/util_boundbox.h" #include "util/util_projection.h" #include 
"util/util_transform.h" @@ -212,4 +213,4 @@ private: CCL_NAMESPACE_END -#endif /* __CAMERA_H__ */ +#endif /* __CAMERA_H__ */ diff --git a/intern/cycles/render/constant_fold.h b/intern/cycles/render/constant_fold.h index 26fa4e8b1c8..6ec94b055e3 100644 --- a/intern/cycles/render/constant_fold.h +++ b/intern/cycles/render/constant_fold.h @@ -70,4 +70,4 @@ public: CCL_NAMESPACE_END -#endif /* __CONSTANT_FOLD_H__ */ +#endif /* __CONSTANT_FOLD_H__ */ diff --git a/intern/cycles/render/coverage.cpp b/intern/cycles/render/coverage.cpp new file mode 100644 index 00000000000..72ef4cda3ff --- /dev/null +++ b/intern/cycles/render/coverage.cpp @@ -0,0 +1,143 @@ +/* + * Copyright 2018 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "render/coverage.h" +#include "kernel/kernel_compat_cpu.h" +#include "kernel/split/kernel_split_data.h" +#include "kernel/kernel_globals.h" +#include "kernel/kernel_id_passes.h" +#include "kernel/kernel_types.h" +#include "util/util_map.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +static bool crypomatte_comp(const pair<float, float>& i, const pair<float, float> j) { return i.first > j.first; } + +void Coverage::finalize() +{ + int pass_offset = 0; + if(kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) { + finalize_buffer(coverage_object, pass_offset); + pass_offset += kernel_data.film.cryptomatte_depth * 4; + } + if(kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) { + finalize_buffer(coverage_material, pass_offset); + pass_offset += kernel_data.film.cryptomatte_depth * 4; + } + if(kernel_data.film.cryptomatte_passes & CRYPT_ASSET) { + finalize_buffer(coverage_asset, pass_offset); + } +} + +void Coverage::init_path_trace() +{ + kg->coverage_object = kg->coverage_material = kg->coverage_asset = NULL; + + if(kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE) { + if(kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) { + coverage_object.clear(); + coverage_object.resize(tile.w * tile.h); + } + if(kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) { + coverage_material.clear(); + coverage_material.resize(tile.w * tile.h); + } + if(kernel_data.film.cryptomatte_passes & CRYPT_ASSET) { + coverage_asset.clear(); + coverage_asset.resize(tile.w * tile.h); + } + } +} + +void Coverage::init_pixel(int x, int y) +{ + if(kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE) { + const int pixel_index = tile.w * (y - tile.y) + x - tile.x; + if(kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) { + kg->coverage_object = &coverage_object[pixel_index]; + } + if(kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) { + kg->coverage_material = &coverage_material[pixel_index]; + } + if(kernel_data.film.cryptomatte_passes & CRYPT_ASSET) { + 
kg->coverage_asset = &coverage_asset[pixel_index]; + } + } +} + +void Coverage::finalize_buffer(vector<CoverageMap> & coverage, const int pass_offset) +{ + if(kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE) { + flatten_buffer(coverage, pass_offset); + } + else { + sort_buffer(pass_offset); + } +} + +void Coverage::flatten_buffer(vector<CoverageMap> &coverage, const int pass_offset) +{ + /* Sort the coverage map and write it to the output */ + int pixel_index = 0; + int pass_stride = tile.buffers->params.get_passes_size(); + for(int y = 0; y < tile.h; ++y) { + for(int x = 0; x < tile.w; ++x) { + const CoverageMap& pixel = coverage[pixel_index]; + if(!pixel.empty()) { + /* buffer offset */ + int index = x + y * tile.stride; + float *buffer = (float*)tile.buffer + index*pass_stride; + + /* sort the cryptomatte pixel */ + vector<pair<float, float> > sorted_pixel; + for(CoverageMap::const_iterator it = pixel.begin(); it != pixel.end(); ++it) { + sorted_pixel.push_back(std::make_pair(it->second, it->first)); + } + sort(sorted_pixel.begin(), sorted_pixel.end(), crypomatte_comp); + int num_slots = 2 * (kernel_data.film.cryptomatte_depth); + if(sorted_pixel.size() > num_slots) { + float leftover = 0.0f; + for(vector<pair<float, float> >::iterator it = sorted_pixel.begin()+num_slots; it != sorted_pixel.end(); ++it) { + leftover += it->first; + } + sorted_pixel[num_slots-1].first += leftover; + } + int limit = min(num_slots, sorted_pixel.size()); + for(int i = 0; i < limit; ++i) { + kernel_write_id_slots(buffer + kernel_data.film.pass_cryptomatte + pass_offset, 2 * (kernel_data.film.cryptomatte_depth), sorted_pixel[i].second, sorted_pixel[i].first); + } + } + ++pixel_index; + } + } +} + +void Coverage::sort_buffer(const int pass_offset) +{ + /* Sort the coverage map and write it to the output */ + int pass_stride = tile.buffers->params.get_passes_size(); + for(int y = 0; y < tile.h; ++y) { + for(int x = 0; x < tile.w; ++x) { + /* buffer offset */ + int index = x + 
y*tile.stride; + float *buffer = (float*)tile.buffer + index*pass_stride; + kernel_sort_id_slots(buffer + kernel_data.film.pass_cryptomatte + pass_offset, 2 * (kernel_data.film.cryptomatte_depth)); + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/render/coverage.h b/intern/cycles/render/coverage.h new file mode 100644 index 00000000000..9ee0bce7517 --- /dev/null +++ b/intern/cycles/render/coverage.h @@ -0,0 +1,49 @@ +/* + * Copyright 2018 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "render/buffers.h" +#include "kernel/kernel_compat_cpu.h" +#include "kernel/split/kernel_split_data.h" +#include "kernel/kernel_globals.h" +#include "util/util_map.h" +#include "util/util_vector.h" + +#ifndef __COVERAGE_H__ +#define __COVERAGE_H__ + +CCL_NAMESPACE_BEGIN + +class Coverage { +public: + Coverage(KernelGlobals *kg_, RenderTile &tile_) : kg(kg_), tile(tile_) { } + void init_path_trace(); + void init_pixel(int x, int y); + void finalize(); +private: + vector<CoverageMap>coverage_object; + vector<CoverageMap>coverage_material; + vector<CoverageMap>coverage_asset; + KernelGlobals *kg; + RenderTile &tile; + void finalize_buffer(vector<CoverageMap>&coverage, const int pass_offset); + void flatten_buffer(vector<CoverageMap>&coverage, const int pass_offset); + void sort_buffer(const int pass_offset); +}; + + +CCL_NAMESPACE_END + +#endif /* __COVERAGE_H__ */ diff --git a/intern/cycles/render/curves.h b/intern/cycles/render/curves.h index 62066d8a809..cf75751c58f 100644 --- a/intern/cycles/render/curves.h +++ b/intern/cycles/render/curves.h @@ -17,8 +17,8 @@ #ifndef __CURVES_H__ #define __CURVES_H__ +#include "util/util_array.h" #include "util/util_types.h" -#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -119,4 +119,4 @@ public: CCL_NAMESPACE_END -#endif /* __CURVES_H__ */ +#endif /* __CURVES_H__ */ diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp index 8f3596ade58..d0f15496e50 100644 --- a/intern/cycles/render/film.cpp +++ b/intern/cycles/render/film.cpp @@ -38,11 +38,14 @@ static bool compare_pass_order(const Pass& a, const Pass& b) return (a.components > b.components); } -void Pass::add(PassType type, array<Pass>& passes) +void Pass::add(PassType type, vector<Pass>& passes, const char *name) { - for(size_t i = 0; i < passes.size(); i++) - if(passes[i].type == type) + for(size_t i = 0; i < passes.size(); i++) { + if(passes[i].type == type && + (name ? 
(passes[i].name == name) : passes[i].name.empty())) { return; + } + } Pass pass; @@ -50,6 +53,9 @@ void Pass::add(PassType type, array<Pass>& passes) pass.filter = true; pass.exposure = false; pass.divide_type = PASS_NONE; + if(name) { + pass.name = name; + } switch(type) { case PASS_NONE: @@ -155,13 +161,15 @@ void Pass::add(PassType type, array<Pass>& passes) pass.components = 4; pass.exposure = true; break; - + case PASS_CRYPTOMATTE: + pass.components = 4; + break; default: assert(false); break; } - passes.push_back_slow(pass); + passes.push_back(pass); /* order from by components, to ensure alignment so passes with size 4 * come first and then passes with size 1 */ @@ -171,19 +179,19 @@ void Pass::add(PassType type, array<Pass>& passes) Pass::add(pass.divide_type, passes); } -bool Pass::equals(const array<Pass>& A, const array<Pass>& B) +bool Pass::equals(const vector<Pass>& A, const vector<Pass>& B) { if(A.size() != B.size()) return false; for(int i = 0; i < A.size(); i++) - if(A[i].type != B[i].type) + if(A[i].type != B[i].type || A[i].name != B[i].name) return false; return true; } -bool Pass::contains(const array<Pass>& passes, PassType type) +bool Pass::contains(const vector<Pass>& passes, PassType type) { for(size_t i = 0; i < passes.size(); i++) if(passes[i].type == type) @@ -290,6 +298,7 @@ Film::Film() use_light_visibility = false; filter_table_offset = TABLE_OFFSET_INVALID; + cryptomatte_passes = CRYPT_NONE; need_update = true; } @@ -314,6 +323,8 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) kfilm->pass_stride = 0; kfilm->use_light_pass = use_light_visibility || use_sample_clamp; + bool have_cryptomatte = false; + for(size_t i = 0; i < passes.size(); i++) { Pass& pass = passes[i]; @@ -434,7 +445,10 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) #endif case PASS_RENDER_TIME: break; - + case PASS_CRYPTOMATTE: + kfilm->pass_cryptomatte = have_cryptomatte ? 
min(kfilm->pass_cryptomatte, kfilm->pass_stride) : kfilm->pass_stride; + have_cryptomatte = true; + break; default: assert(false); break; @@ -471,6 +485,9 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) kfilm->mist_inv_depth = (mist_depth > 0.0f)? 1.0f/mist_depth: 0.0f; kfilm->mist_falloff = mist_falloff; + kfilm->cryptomatte_passes = cryptomatte_passes; + kfilm->cryptomatte_depth = cryptomatte_depth; + pass_stride = kfilm->pass_stride; denoising_data_offset = kfilm->pass_denoising_data; denoising_clean_offset = kfilm->pass_denoising_clean; @@ -490,7 +507,7 @@ bool Film::modified(const Film& film) return !Node::equals(film) || !Pass::equals(passes, film.passes); } -void Film::tag_passes_update(Scene *scene, const array<Pass>& passes_) +void Film::tag_passes_update(Scene *scene, const vector<Pass>& passes_) { if(Pass::contains(passes, PASS_UV) != Pass::contains(passes_, PASS_UV)) { scene->mesh_manager->tag_update(scene); diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h index 6ab2eea79b8..c597db4e4c5 100644 --- a/intern/cycles/render/film.h +++ b/intern/cycles/render/film.h @@ -45,10 +45,11 @@ public: bool filter; bool exposure; PassType divide_type; + string name; - static void add(PassType type, array<Pass>& passes); - static bool equals(const array<Pass>& A, const array<Pass>& B); - static bool contains(const array<Pass>& passes, PassType); + static void add(PassType type, vector<Pass>& passes, const char* name = NULL); + static bool equals(const vector<Pass>& A, const vector<Pass>& B); + static bool contains(const vector<Pass>& passes, PassType); }; class Film : public Node { @@ -56,7 +57,7 @@ public: NODE_DECLARE float exposure; - array<Pass> passes; + vector<Pass> passes; bool denoising_data_pass; bool denoising_clean_pass; int denoising_flags; @@ -76,6 +77,8 @@ public: bool use_light_visibility; bool use_sample_clamp; + CryptomatteType cryptomatte_passes; + int cryptomatte_depth; bool need_update; @@ -86,10 
+89,10 @@ public: void device_free(Device *device, DeviceScene *dscene, Scene *scene); bool modified(const Film& film); - void tag_passes_update(Scene *scene, const array<Pass>& passes_); + void tag_passes_update(Scene *scene, const vector<Pass>& passes_); void tag_update(Scene *scene); }; CCL_NAMESPACE_END -#endif /* __FILM_H__ */ +#endif /* __FILM_H__ */ diff --git a/intern/cycles/render/graph.h b/intern/cycles/render/graph.h index 426522066b3..d14a59b4900 100644 --- a/intern/cycles/render/graph.h +++ b/intern/cycles/render/graph.h @@ -293,4 +293,4 @@ protected: CCL_NAMESPACE_END -#endif /* __GRAPH_H__ */ +#endif /* __GRAPH_H__ */ diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h index d94ebe564e3..8367a6811bd 100644 --- a/intern/cycles/render/image.h +++ b/intern/cycles/render/image.h @@ -164,4 +164,4 @@ private: CCL_NAMESPACE_END -#endif /* __IMAGE_H__ */ +#endif /* __IMAGE_H__ */ diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h index f68400ac416..6a7e2056851 100644 --- a/intern/cycles/render/integrator.h +++ b/intern/cycles/render/integrator.h @@ -94,4 +94,4 @@ public: CCL_NAMESPACE_END -#endif /* __INTEGRATOR_H__ */ +#endif /* __INTEGRATOR_H__ */ diff --git a/intern/cycles/render/light.h b/intern/cycles/render/light.h index 32a911dc256..f4dfe0cadbf 100644 --- a/intern/cycles/render/light.h +++ b/intern/cycles/render/light.h @@ -139,4 +139,4 @@ protected: CCL_NAMESPACE_END -#endif /* __LIGHT_H__ */ +#endif /* __LIGHT_H__ */ diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp index 8a00b88af12..5f884a3f871 100644 --- a/intern/cycles/render/mesh.cpp +++ b/intern/cycles/render/mesh.cpp @@ -39,6 +39,10 @@ #include "util/util_progress.h" #include "util/util_set.h" +#ifdef WITH_EMBREE +# include "bvh/bvh_embree.h" +#endif + CCL_NAMESPACE_BEGIN /* Triangle */ @@ -1068,11 +1072,14 @@ void Mesh::compute_bvh(Device *device, bparams.use_spatial_split = params->use_bvh_spatial_split; 
bparams.bvh_layout = BVHParams::best_bvh_layout( params->bvh_layout, - device->info.bvh_layout_mask); + device->get_bvh_layout_mask()); bparams.use_unaligned_nodes = dscene->data.bvh.have_curves && params->use_bvh_unaligned_nodes; bparams.num_motion_triangle_steps = params->num_bvh_time_steps; bparams.num_motion_curve_steps = params->num_bvh_time_steps; + bparams.bvh_type = params->bvh_type; + bparams.curve_flags = dscene->data.curve.curveflags; + bparams.curve_subdivisions = dscene->data.curve.subdivisions; delete bvh; bvh = BVH::create(bparams, objects); @@ -1284,9 +1291,9 @@ void MeshManager::update_osl_attributes(Device *device, Scene *scene, vector<Att } } #else - (void)device; - (void)scene; - (void)mesh_attributes; + (void) device; + (void) scene; + (void) mesh_attributes; #endif } @@ -1855,20 +1862,38 @@ void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene * bparams.top_level = true; bparams.bvh_layout = BVHParams::best_bvh_layout( scene->params.bvh_layout, - device->info.bvh_layout_mask); + device->get_bvh_layout_mask()); bparams.use_spatial_split = scene->params.use_bvh_spatial_split; bparams.use_unaligned_nodes = dscene->data.bvh.have_curves && scene->params.use_bvh_unaligned_nodes; bparams.num_motion_triangle_steps = scene->params.num_bvh_time_steps; bparams.num_motion_curve_steps = scene->params.num_bvh_time_steps; + bparams.bvh_type = scene->params.bvh_type; + bparams.curve_flags = dscene->data.curve.curveflags; + bparams.curve_subdivisions = dscene->data.curve.subdivisions; VLOG(1) << "Using " << bvh_layout_name(bparams.bvh_layout) << " layout."; +#ifdef WITH_EMBREE + if(bparams.bvh_layout == BVH_LAYOUT_EMBREE) { + if(dscene->data.bvh.scene) { + BVHEmbree::destroy(dscene->data.bvh.scene); + } + } +#endif + BVH *bvh = BVH::create(bparams, scene->objects); - bvh->build(progress); + bvh->build(progress, &device->stats); if(progress.get_cancel()) { +#ifdef WITH_EMBREE + if(bparams.bvh_layout == BVH_LAYOUT_EMBREE) { + 
if(dscene->data.bvh.scene) { + BVHEmbree::destroy(dscene->data.bvh.scene); + } + } +#endif delete bvh; return; } @@ -1923,6 +1948,16 @@ void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene * dscene->data.bvh.bvh_layout = bparams.bvh_layout; dscene->data.bvh.use_bvh_steps = (scene->params.num_bvh_time_steps != 0); + +#ifdef WITH_EMBREE + if(bparams.bvh_layout == BVH_LAYOUT_EMBREE) { + dscene->data.bvh.scene = ((BVHEmbree*)bvh)->scene; + } + else { + dscene->data.bvh.scene = NULL; + } +#endif + delete bvh; } @@ -2266,7 +2301,7 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene) og->object_names.clear(); } #else - (void)device; + (void) device; #endif } diff --git a/intern/cycles/render/mesh.h b/intern/cycles/render/mesh.h index 444f03a3664..7d36b2cd7ca 100644 --- a/intern/cycles/render/mesh.h +++ b/intern/cycles/render/mesh.h @@ -22,6 +22,7 @@ #include "render/attribute.h" #include "render/shader.h" +#include "util/util_array.h" #include "util/util_boundbox.h" #include "util/util_list.h" #include "util/util_map.h" @@ -390,4 +391,4 @@ protected: CCL_NAMESPACE_END -#endif /* __MESH_H__ */ +#endif /* __MESH_H__ */ diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h index 28bbe2de05a..048f0fcaa24 100644 --- a/intern/cycles/render/nodes.h +++ b/intern/cycles/render/nodes.h @@ -20,6 +20,7 @@ #include "render/graph.h" #include "graph/node.h" +#include "util/util_array.h" #include "util/util_string.h" CCL_NAMESPACE_BEGIN @@ -1161,4 +1162,4 @@ public: CCL_NAMESPACE_END -#endif /* __NODES_H__ */ +#endif /* __NODES_H__ */ diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp index e3f35c366d6..dc7a1043208 100644 --- a/intern/cycles/render/object.cpp +++ b/intern/cycles/render/object.cpp @@ -27,7 +27,9 @@ #include "util/util_logging.h" #include "util/util_map.h" #include "util/util_progress.h" +#include "util/util_set.h" #include "util/util_vector.h" +#include "util/util_murmurhash.h" 
#include "subd/subd_patch_table.h" @@ -483,6 +485,10 @@ void ObjectManager::device_update_object_transform(UpdateObjectTransformState *s kobject.numverts = mesh->verts.size(); kobject.patch_map_offset = 0; kobject.attribute_map_offset = 0; + uint32_t hash_name = util_murmur_hash3(ob->name.c_str(), ob->name.length(), 0); + uint32_t hash_asset = util_murmur_hash3(ob->asset_name.c_str(), ob->asset_name.length(), 0); + kobject.cryptomatte_object = util_hash_to_float(hash_name); + kobject.cryptomatte_asset = util_hash_to_float(hash_asset); /* Object flag. */ if(ob->use_holdout) { @@ -839,4 +845,37 @@ void ObjectManager::tag_update(Scene *scene) scene->light_manager->need_update = true; } +string ObjectManager::get_cryptomatte_objects(Scene *scene) +{ + string manifest = "{"; + + unordered_set<ustring, ustringHash> objects; + foreach(Object *object, scene->objects) { + if(objects.count(object->name)) { + continue; + } + objects.insert(object->name); + uint32_t hash_name = util_murmur_hash3(object->name.c_str(), object->name.length(), 0); + manifest += string_printf("\"%s\":\"%08x\",", object->name.c_str(), hash_name); + } + manifest[manifest.size()-1] = '}'; + return manifest; +} + +string ObjectManager::get_cryptomatte_assets(Scene *scene) +{ + string manifest = "{"; + unordered_set<ustring, ustringHash> assets; + foreach(Object *ob, scene->objects) { + if(assets.count(ob->asset_name)) { + continue; + } + assets.insert(ob->asset_name); + uint32_t hash_asset = util_murmur_hash3(ob->asset_name.c_str(), ob->asset_name.length(), 0); + manifest += string_printf("\"%s\":\"%08x\",", ob->asset_name.c_str(), hash_asset); + } + manifest[manifest.size()-1] = '}'; + return manifest; +} + CCL_NAMESPACE_END diff --git a/intern/cycles/render/object.h b/intern/cycles/render/object.h index b80c4aef70b..87e6e6652ad 100644 --- a/intern/cycles/render/object.h +++ b/intern/cycles/render/object.h @@ -20,11 +20,13 @@ #include "graph/node.h" #include "render/scene.h" +#include 
"util/util_array.h" #include "util/util_boundbox.h" #include "util/util_param.h" #include "util/util_transform.h" #include "util/util_thread.h" #include "util/util_types.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -48,6 +50,7 @@ public: BoundBox bounds; uint random_id; int pass_id; + ustring asset_name; vector<ParamValue> attributes; uint visibility; array<Transform> motion; @@ -115,6 +118,9 @@ public: void apply_static_transforms(DeviceScene *dscene, Scene *scene, Progress& progress); + string get_cryptomatte_objects(Scene *scene); + string get_cryptomatte_assets(Scene *scene); + protected: void device_update_object_transform(UpdateObjectTransformState *state, Object *ob, @@ -128,4 +134,4 @@ protected: CCL_NAMESPACE_END -#endif /* __OBJECT_H__ */ +#endif /* __OBJECT_H__ */ diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp index 496e9d9491a..3fbc7d33a74 100644 --- a/intern/cycles/render/osl.cpp +++ b/intern/cycles/render/osl.cpp @@ -1255,6 +1255,6 @@ void OSLCompiler::parameter_color_array(const char * /*name*/, const array<float { } -#endif /* WITH_OSL */ +#endif /* WITH_OSL */ CCL_NAMESPACE_END diff --git a/intern/cycles/render/osl.h b/intern/cycles/render/osl.h index 966fc1965d7..e196e0be787 100644 --- a/intern/cycles/render/osl.h +++ b/intern/cycles/render/osl.h @@ -17,6 +17,7 @@ #ifndef __OSL_H__ #define __OSL_H__ +#include "util/util_array.h" #include "util/util_set.h" #include "util/util_string.h" #include "util/util_thread.h" @@ -171,4 +172,4 @@ private: CCL_NAMESPACE_END -#endif /* __OSL_H__ */ +#endif /* __OSL_H__ */ diff --git a/intern/cycles/render/particles.h b/intern/cycles/render/particles.h index 7e7afd5d054..27821907af0 100644 --- a/intern/cycles/render/particles.h +++ b/intern/cycles/render/particles.h @@ -17,8 +17,8 @@ #ifndef __PARTICLES_H__ #define __PARTICLES_H__ +#include "util/util_array.h" #include "util/util_types.h" -#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -68,4 +68,4 @@ public: 
CCL_NAMESPACE_END -#endif /* __PARTICLES_H__ */ +#endif /* __PARTICLES_H__ */ diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp index 9f93fed139c..ccaca8707c8 100644 --- a/intern/cycles/render/scene.cpp +++ b/intern/cycles/render/scene.cpp @@ -215,6 +215,11 @@ void Scene::device_update(Device *device_, Progress& progress) object_manager->device_update(device, &dscene, this, progress); if(progress.get_cancel() || device->have_error()) return; + + progress.set_status("Updating Hair Systems"); + curve_system_manager->device_update(device, &dscene, this, progress); + + if(progress.get_cancel() || device->have_error()) return; progress.set_status("Updating Particle Systems"); particle_system_manager->device_update(device, &dscene, this, progress); @@ -240,12 +245,7 @@ void Scene::device_update(Device *device_, Progress& progress) camera->device_update_volume(device, &dscene, this); if(progress.get_cancel() || device->have_error()) return; - - progress.set_status("Updating Hair Systems"); - curve_system_manager->device_update(device, &dscene, this, progress); - - if(progress.get_cancel() || device->have_error()) return; - + progress.set_status("Updating Lookup Tables"); lookup_tables->device_update(device, &dscene); diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h index dd8069537eb..57ea1d471e8 100644 --- a/intern/cycles/render/scene.h +++ b/intern/cycles/render/scene.h @@ -166,7 +166,6 @@ public: bool use_bvh_spatial_split; bool use_bvh_unaligned_nodes; int num_bvh_time_steps; - bool persistent_data; int texture_limit; @@ -269,4 +268,4 @@ protected: CCL_NAMESPACE_END -#endif /* __SCENE_H__ */ +#endif /* __SCENE_H__ */ diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index d0aa985b035..d6ecafa19b7 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -682,7 +682,10 @@ DeviceRequestedFeatures Session::get_requested_device_features() BakeManager 
*bake_manager = scene->bake_manager; requested_features.use_baking = bake_manager->get_baking(); requested_features.use_integrator_branched = (scene->integrator->method == Integrator::BRANCHED_PATH); - requested_features.use_denoising = params.use_denoising; + if(params.denoising_passes) { + requested_features.use_denoising = true; + requested_features.use_shadow_tricks = true; + } return requested_features; } diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h index 61f62f8e712..56a69919a7a 100644 --- a/intern/cycles/render/session.h +++ b/intern/cycles/render/session.h @@ -58,6 +58,7 @@ public: bool display_buffer_linear; bool use_denoising; + bool denoising_passes; int denoising_radius; float denoising_strength; float denoising_feature_strength; @@ -89,6 +90,7 @@ public: threads = 0; use_denoising = false; + denoising_passes = false; denoising_radius = 8; denoising_strength = 0.0f; denoising_feature_strength = 0.0f; @@ -236,4 +238,4 @@ protected: CCL_NAMESPACE_END -#endif /* __SESSION_H__ */ +#endif /* __SESSION_H__ */ diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp index ac605305b94..d6c2d7502f2 100644 --- a/intern/cycles/render/shader.cpp +++ b/intern/cycles/render/shader.cpp @@ -30,6 +30,7 @@ #include "render/tables.h" #include "util/util_foreach.h" +#include "util/util_murmurhash.h" #ifdef WITH_OCIO # include <OpenColorIO/OpenColorIO.h> @@ -387,7 +388,7 @@ ShaderManager *ShaderManager::create(Scene *scene, int shadingsystem) { ShaderManager *manager; - (void)shadingsystem; /* Ignored when built without OSL. */ + (void) shadingsystem; /* Ignored when built without OSL. 
*/ #ifdef WITH_OSL if(shadingsystem == SHADINGSYSTEM_OSL) { @@ -523,12 +524,15 @@ void ShaderManager::device_update_common(Device *device, if(shader->is_constant_emission(&constant_emission)) flag |= SD_HAS_CONSTANT_EMISSION; + uint32_t cryptomatte_id = util_murmur_hash3(shader->name.c_str(), shader->name.length(), 0); + /* regular shader */ kshader->flags = flag; kshader->pass_id = shader->pass_id; kshader->constant_emission[0] = constant_emission.x; kshader->constant_emission[1] = constant_emission.y; kshader->constant_emission[2] = constant_emission.z; + kshader->cryptomatte_id = util_hash_to_float(cryptomatte_id); kshader++; has_transparent_shadow |= (flag & SD_HAS_TRANSPARENT_SHADOW) != 0; @@ -695,4 +699,20 @@ float ShaderManager::linear_rgb_to_gray(float3 c) return dot(c, rgb_to_y); } +string ShaderManager::get_cryptomatte_materials(Scene *scene) +{ + string manifest = "{"; + unordered_set<ustring, ustringHash> materials; + foreach(Shader *shader, scene->shaders) { + if(materials.count(shader->name)) { + continue; + } + materials.insert(shader->name); + uint32_t cryptomatte_id = util_murmur_hash3(shader->name.c_str(), shader->name.length(), 0); + manifest += string_printf("\"%s\":\"%08x\",", shader->name.c_str(), cryptomatte_id); + } + manifest[manifest.size()-1] = '}'; + return manifest; +} + CCL_NAMESPACE_END diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h index 58314a1e310..4c7b2fd433b 100644 --- a/intern/cycles/render/shader.h +++ b/intern/cycles/render/shader.h @@ -197,6 +197,8 @@ public: float linear_rgb_to_gray(float3 c); + string get_cryptomatte_materials(Scene *scene); + protected: ShaderManager(); @@ -222,4 +224,4 @@ protected: CCL_NAMESPACE_END -#endif /* __SHADER_H__ */ +#endif /* __SHADER_H__ */ diff --git a/intern/cycles/render/sobol.h b/intern/cycles/render/sobol.h index d38857d2b35..ce7a28587f2 100644 --- a/intern/cycles/render/sobol.h +++ b/intern/cycles/render/sobol.h @@ -28,4 +28,4 @@ void 
sobol_generate_direction_vectors(uint vectors[][SOBOL_BITS], int dimensions CCL_NAMESPACE_END -#endif /* __SOBOL_H__ */ +#endif /* __SOBOL_H__ */ diff --git a/intern/cycles/render/stats.h b/intern/cycles/render/stats.h index 72d5f1dd93d..2ff0ec3e0e9 100644 --- a/intern/cycles/render/stats.h +++ b/intern/cycles/render/stats.h @@ -101,4 +101,4 @@ public: CCL_NAMESPACE_END -#endif /* __RENDER_STATS_H__ */ +#endif /* __RENDER_STATS_H__ */ diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp index 44b7eeec6db..b380117e729 100644 --- a/intern/cycles/render/svm.cpp +++ b/intern/cycles/render/svm.cpp @@ -266,7 +266,7 @@ int SVMCompiler::stack_assign(ShaderInput *input) add_node(NODE_VALUE_V, input->stack_offset); add_node(NODE_VALUE_V, node->get_float3(input->socket_type)); } - else /* should not get called for closure */ + else /* should not get called for closure */ assert(0); } } diff --git a/intern/cycles/render/svm.h b/intern/cycles/render/svm.h index 1d0613bbfdc..af97a490a87 100644 --- a/intern/cycles/render/svm.h +++ b/intern/cycles/render/svm.h @@ -21,6 +21,7 @@ #include "render/graph.h" #include "render/shader.h" +#include "util/util_array.h" #include "util/util_set.h" #include "util/util_string.h" #include "util/util_thread.h" @@ -223,4 +224,4 @@ protected: CCL_NAMESPACE_END -#endif /* __SVM_H__ */ +#endif /* __SVM_H__ */ diff --git a/intern/cycles/render/tables.h b/intern/cycles/render/tables.h index 09d961a9c3c..709333cb1b6 100644 --- a/intern/cycles/render/tables.h +++ b/intern/cycles/render/tables.h @@ -50,4 +50,4 @@ public: CCL_NAMESPACE_END -#endif /* __TABLES_H__ */ +#endif /* __TABLES_H__ */ diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h index f72f653f4c2..2835c793073 100644 --- a/intern/cycles/render/tile.h +++ b/intern/cycles/render/tile.h @@ -154,4 +154,4 @@ protected: CCL_NAMESPACE_END -#endif /* __TILE_H__ */ +#endif /* __TILE_H__ */ diff --git a/intern/cycles/subd/subd_dice.h 
b/intern/cycles/subd/subd_dice.h index 4617c782b3a..2bef8d4cf8d 100644 --- a/intern/cycles/subd/subd_dice.h +++ b/intern/cycles/subd/subd_dice.h @@ -134,4 +134,4 @@ public: CCL_NAMESPACE_END -#endif /* __SUBD_DICE_H__ */ +#endif /* __SUBD_DICE_H__ */ diff --git a/intern/cycles/subd/subd_patch.h b/intern/cycles/subd/subd_patch.h index 64ec8f70951..84100139f2c 100644 --- a/intern/cycles/subd/subd_patch.h +++ b/intern/cycles/subd/subd_patch.h @@ -56,4 +56,4 @@ public: CCL_NAMESPACE_END -#endif /* __SUBD_PATCH_H__ */ +#endif /* __SUBD_PATCH_H__ */ diff --git a/intern/cycles/subd/subd_patch_table.cpp b/intern/cycles/subd/subd_patch_table.cpp index 13a6f284542..0e9d3f37af4 100644 --- a/intern/cycles/subd/subd_patch_table.cpp +++ b/intern/cycles/subd/subd_patch_table.cpp @@ -252,8 +252,8 @@ void PackedPatchTable::pack(Far::PatchTable* patch_table, int offset) build_patch_map(*this, patch_table, offset); #else - (void)patch_table; - (void)offset; + (void) patch_table; + (void) offset; #endif } diff --git a/intern/cycles/subd/subd_patch_table.h b/intern/cycles/subd/subd_patch_table.h index 45be7051992..1765578c42e 100644 --- a/intern/cycles/subd/subd_patch_table.h +++ b/intern/cycles/subd/subd_patch_table.h @@ -17,8 +17,8 @@ #ifndef __SUBD_PATCH_TABLE_H__ #define __SUBD_PATCH_TABLE_H__ +#include "util/util_array.h" #include "util/util_types.h" -#include "util/util_vector.h" #ifdef WITH_OPENSUBDIV #ifdef _MSC_VER @@ -59,4 +59,4 @@ struct PackedPatchTable { CCL_NAMESPACE_END -#endif /* __SUBD_PATCH_TABLE_H__ */ +#endif /* __SUBD_PATCH_TABLE_H__ */ diff --git a/intern/cycles/subd/subd_split.h b/intern/cycles/subd/subd_split.h index 7a276b35382..3368c93944b 100644 --- a/intern/cycles/subd/subd_split.h +++ b/intern/cycles/subd/subd_split.h @@ -56,4 +56,4 @@ public: CCL_NAMESPACE_END -#endif /* __SUBD_SPLIT_H__ */ +#endif /* __SUBD_SPLIT_H__ */ diff --git a/intern/cycles/test/render_graph_finalize_test.cpp b/intern/cycles/test/render_graph_finalize_test.cpp index 
b66a91adbda..cfdab7a6433 100644 --- a/intern/cycles/test/render_graph_finalize_test.cpp +++ b/intern/cycles/test/render_graph_finalize_test.cpp @@ -20,6 +20,7 @@ #include "render/graph.h" #include "render/scene.h" #include "render/nodes.h" +#include "util/util_array.h" #include "util/util_logging.h" #include "util/util_string.h" #include "util/util_vector.h" diff --git a/intern/cycles/test/util_path_test.cpp b/intern/cycles/test/util_path_test.cpp index c2f400c105d..1df568493d8 100644 --- a/intern/cycles/test/util_path_test.cpp +++ b/intern/cycles/test/util_path_test.cpp @@ -370,7 +370,7 @@ TEST(util_path_is_relative, relative_windir_on_unix) bool is_relative = path_is_relative("tmp\\foo.txt"); EXPECT_TRUE(is_relative); } -#endif /* !_WIN32 */ +#endif /* !_WIN32 */ #ifdef _WIN32 TEST(util_path_is_relative, absolute_windows) @@ -396,6 +396,6 @@ TEST(util_path_is_relative, relative_unixdir_on_windows) bool is_relative = path_is_relative("tmp/foo.txt"); EXPECT_TRUE(is_relative); } -#endif /* _WIN32 */ +#endif /* _WIN32 */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index 291f9a9fcae..77d47984ee7 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -15,6 +15,7 @@ set(SRC util_logging.cpp util_math_cdf.cpp util_md5.cpp + util_murmurhash.cpp util_path.cpp util_string.cpp util_simd.cpp @@ -36,6 +37,7 @@ set(SRC_HEADERS util_algorithm.h util_aligned_malloc.h util_args.h + util_array.h util_atomic.h util_boundbox.h util_debug.h @@ -64,6 +66,7 @@ set(SRC_HEADERS util_math_int4.h util_math_matrix.h util_md5.h + util_murmurhash.h util_opengl.h util_optimization.h util_param.h diff --git a/intern/cycles/util/util_algorithm.h b/intern/cycles/util/util_algorithm.h index eb874713d43..f9e6476cc52 100644 --- a/intern/cycles/util/util_algorithm.h +++ b/intern/cycles/util/util_algorithm.h @@ -29,4 +29,4 @@ using std::remove; CCL_NAMESPACE_END -#endif /* __UTIL_ALGORITHM_H__ */ +#endif /* 
__UTIL_ALGORITHM_H__ */ diff --git a/intern/cycles/util/util_args.h b/intern/cycles/util/util_args.h index be6f2c2b9f1..9fe54b14d77 100644 --- a/intern/cycles/util/util_args.h +++ b/intern/cycles/util/util_args.h @@ -28,4 +28,4 @@ OIIO_NAMESPACE_USING CCL_NAMESPACE_END -#endif /* __UTIL_ARGS_H__ */ +#endif /* __UTIL_ARGS_H__ */ diff --git a/intern/cycles/util/util_array.h b/intern/cycles/util/util_array.h new file mode 100644 index 00000000000..5f18d434c31 --- /dev/null +++ b/intern/cycles/util/util_array.h @@ -0,0 +1,289 @@ +/* + * Copyright 2011-2018 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_ARRAY_H__ +#define __UTIL_ARRAY_H__ + +#include <cassert> +#include <cstring> + +#include "util/util_aligned_malloc.h" +#include "util/util_guarded_allocator.h" +#include "util/util_types.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +/* Simplified version of vector, serving multiple purposes: + * - somewhat faster in that it does not clear memory on resize/alloc, + * this was actually showing up in profiles quite significantly. 
it + * also does not run any constructors/destructors + * - if this is used, we are not tempted to use inefficient operations + * - aligned allocation for CPU native data types */ + +template<typename T, size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES> +class array +{ +public: + array() + : data_(NULL), + datasize_(0), + capacity_(0) + {} + + explicit array(size_t newsize) + { + if(newsize == 0) { + data_ = NULL; + datasize_ = 0; + capacity_ = 0; + } + else { + data_ = mem_allocate(newsize); + datasize_ = newsize; + capacity_ = datasize_; + } + } + + array(const array& from) + { + if(from.datasize_ == 0) { + data_ = NULL; + datasize_ = 0; + capacity_ = 0; + } + else { + data_ = mem_allocate(from.datasize_); + memcpy(data_, from.data_, from.datasize_*sizeof(T)); + datasize_ = from.datasize_; + capacity_ = datasize_; + } + } + + array& operator=(const array& from) + { + if(this != &from) { + resize(from.size()); + memcpy((void*)data_, from.data_, datasize_*sizeof(T)); + } + + return *this; + } + + array& operator=(const vector<T>& from) + { + resize(from.size()); + + if(from.size() > 0) { + memcpy(data_, &from[0], datasize_*sizeof(T)); + } + + return *this; + } + + ~array() + { + mem_free(data_, capacity_); + } + + bool operator==(const array<T>& other) const + { + if(datasize_ != other.datasize_) { + return false; + } + + return memcmp(data_, other.data_, datasize_*sizeof(T)) == 0; + } + + bool operator!=(const array<T>& other) const + { + return !(*this == other); + } + + void steal_data(array& from) + { + if(this != &from) { + clear(); + + data_ = from.data_; + datasize_ = from.datasize_; + capacity_ = from.capacity_; + + from.data_ = NULL; + from.datasize_ = 0; + from.capacity_ = 0; + } + } + + T *steal_pointer() + { + T *ptr = data_; + data_ = NULL; + clear(); + return ptr; + } + + T* resize(size_t newsize) + { + if(newsize == 0) { + clear(); + } + else if(newsize != datasize_) { + if(newsize > capacity_) { + T *newdata = mem_allocate(newsize); + if(newdata == 
NULL) { + /* Allocation failed, likely out of memory. */ + clear(); + return NULL; + } + else if(data_ != NULL) { + memcpy((void *)newdata, + data_, + ((datasize_ < newsize)? datasize_: newsize)*sizeof(T)); + mem_free(data_, capacity_); + } + data_ = newdata; + capacity_ = newsize; + } + datasize_ = newsize; + } + return data_; + } + + T* resize(size_t newsize, const T& value) + { + size_t oldsize = size(); + resize(newsize); + + for(size_t i = oldsize; i < size(); i++) { + data_[i] = value; + } + + return data_; + } + + void clear() + { + if(data_ != NULL) { + mem_free(data_, capacity_); + data_ = NULL; + } + datasize_ = 0; + capacity_ = 0; + } + + size_t empty() const + { + return datasize_ == 0; + } + + size_t size() const + { + return datasize_; + } + + T* data() + { + return data_; + } + + const T* data() const + { + return data_; + } + + T& operator[](size_t i) const + { + assert(i < datasize_); + return data_[i]; + } + + void reserve(size_t newcapacity) + { + if(newcapacity > capacity_) { + T *newdata = mem_allocate(newcapacity); + if(data_ != NULL) { + memcpy(newdata, data_, ((datasize_ < newcapacity)? datasize_: newcapacity)*sizeof(T)); + mem_free(data_, capacity_); + } + data_ = newdata; + capacity_ = newcapacity; + } + } + + size_t capacity() const + { + return capacity_; + } + + // do not use this method unless you are sure the code is not performance critical + void push_back_slow(const T& t) + { + if(capacity_ == datasize_) + { + reserve(datasize_ == 0 ? 
1 : (size_t)((datasize_ + 1) * 1.2)); + } + + data_[datasize_++] = t; + } + + void push_back_reserved(const T& t) + { + assert(datasize_ < capacity_); + push_back_slow(t); + } + + void append(const array<T>& from) + { + if(from.size()) { + size_t old_size = size(); + resize(old_size + from.size()); + memcpy(data_ + old_size, from.data(), sizeof(T) * from.size()); + } + } + +protected: + inline T* mem_allocate(size_t N) + { + if(N == 0) { + return NULL; + } + T *mem = (T*)util_aligned_malloc(sizeof(T)*N, alignment); + if(mem != NULL) { + util_guarded_mem_alloc(sizeof(T)*N); + } + else { + throw std::bad_alloc(); + } + return mem; + } + + inline void mem_free(T *mem, size_t N) + { + if(mem != NULL) { + util_guarded_mem_free(sizeof(T)*N); + util_aligned_free(mem); + } + } + + T *data_; + size_t datasize_; + size_t capacity_; +}; + +CCL_NAMESPACE_END + +#endif /* __UTIL_ARRAY_H__ */ diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h index f3c7ae546a0..477b667a6fe 100644 --- a/intern/cycles/util/util_atomic.h +++ b/intern/cycles/util/util_atomic.h @@ -23,12 +23,13 @@ #include "atomic_ops.h" #define atomic_add_and_fetch_float(p, x) atomic_add_and_fetch_fl((p), (x)) +#define atomic_compare_and_swap_float(p, old_val, new_val) atomic_cas_float((p), (old_val), (new_val)) #define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1) #define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_add_uint32((p), -1) #define CCL_LOCAL_MEM_FENCE 0 -#define ccl_barrier(flags) (void)0 +#define ccl_barrier(flags) ((void) 0) #else /* __KERNEL_GPU__ */ @@ -57,6 +58,20 @@ ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *so return new_value.float_value; } +ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float *dest, + const float old_val, const float new_val) +{ + union { + unsigned int int_value; + float float_value; + } new_value, prev_value, result; + prev_value.float_value = old_val; + 
new_value.float_value = new_val; + result.int_value = atomic_cmpxchg((volatile ccl_global unsigned int *)dest, + prev_value.int_value, new_value.int_value); + return result.float_value; +} + #define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x)) #define atomic_fetch_and_inc_uint32(p) atomic_inc((p)) #define atomic_fetch_and_dec_uint32(p) atomic_dec((p)) @@ -75,6 +90,19 @@ ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *so #define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1) #define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_sub_uint32((p), 1) +ccl_device_inline float atomic_compare_and_swap_float(volatile float *dest, + const float old_val, const float new_val) +{ + union { + unsigned int int_value; + float float_value; + } new_value, prev_value, result; + prev_value.float_value = old_val; + new_value.float_value = new_val; + result.int_value = atomicCAS((unsigned int *)dest, prev_value.int_value,new_value.int_value); + return result.float_value; +} + #define CCL_LOCAL_MEM_FENCE #define ccl_barrier(flags) __syncthreads() @@ -82,4 +110,4 @@ ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *so #endif /* __KERNEL_GPU__ */ -#endif /* __UTIL_ATOMIC_H__ */ +#endif /* __UTIL_ATOMIC_H__ */ diff --git a/intern/cycles/util/util_avxb.h b/intern/cycles/util/util_avxb.h index 60d9bb44256..25ef39d39ae 100644 --- a/intern/cycles/util/util_avxb.h +++ b/intern/cycles/util/util_avxb.h @@ -44,23 +44,12 @@ struct avxb __forceinline operator const __m256i( void ) const { return _mm256_castps_si256(m256); } __forceinline operator const __m256d( void ) const { return _mm256_castps_pd(m256); } - //__forceinline avxb ( bool a ) - // : m256(_mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {} - //__forceinline avxb ( bool a, bool b) - // : m256(_mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)]) {} - //__forceinline avxb ( bool 
a, bool b, bool c, bool d) - // : m256(_mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {} - //__forceinline avxb(int mask) { - // assert(mask >= 0 && mask < 16); - // m128 = _mm_lookupmask_ps[mask]; - //} - //////////////////////////////////////////////////////////////////////////////// /// Constants //////////////////////////////////////////////////////////////////////////////// __forceinline avxb( FalseTy ) : m256(_mm256_setzero_ps()) {} - __forceinline avxb( TrueTy ) : m256(_mm256_castsi256_ps(_mm256_cmpeq_epi32(_mm256_setzero_si256(), _mm256_setzero_si256()))) {} + __forceinline avxb( TrueTy ) : m256(_mm256_castsi256_ps(_mm256_set1_epi32(-1))) {} //////////////////////////////////////////////////////////////////////////////// /// Array Access @@ -97,7 +86,21 @@ __forceinline const avxb operator ^=( avxb& a, const avxb& b ) { return a = a ^ //////////////////////////////////////////////////////////////////////////////// __forceinline const avxb operator !=( const avxb& a, const avxb& b ) { return _mm256_xor_ps(a, b); } -__forceinline const avxb operator ==( const avxb& a, const avxb& b ) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); } +__forceinline const avxb operator ==( const avxb& a, const avxb& b ) +{ +#ifdef __KERNEL_AVX2__ + return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); +#else + __m128i a_lo = _mm_castps_si128(_mm256_extractf128_ps(a, 0)); + __m128i a_hi = _mm_castps_si128(_mm256_extractf128_ps(a, 1)); + __m128i b_lo = _mm_castps_si128(_mm256_extractf128_ps(b, 0)); + __m128i b_hi = _mm_castps_si128(_mm256_extractf128_ps(b, 1)); + __m128i c_lo = _mm_cmpeq_epi32(a_lo, b_lo); + __m128i c_hi = _mm_cmpeq_epi32(a_hi, b_hi); + __m256i result = _mm256_insertf128_si256(_mm256_castsi128_si256(c_lo), c_hi, 1); + return _mm256_castsi256_ps(result); +#endif +} __forceinline const avxb select( const avxb& m, const avxb& t, const avxb& f ) { #if defined(__KERNEL_SSE41__) @@ -114,47 +117,6 @@ __forceinline const 
avxb select( const avxb& m, const avxb& t, const avxb& f ) { __forceinline const avxb unpacklo( const avxb& a, const avxb& b ) { return _mm256_unpacklo_ps(a, b); } __forceinline const avxb unpackhi( const avxb& a, const avxb& b ) { return _mm256_unpackhi_ps(a, b); } -#define _MM256_SHUFFLE(fp7,fp6,fp5,fp4,fp3,fp2,fp1,fp0) (((fp7) << 14) | ((fp6) << 12) | ((fp5) << 10) | ((fp4) << 8) | \ - ((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) - -template<size_t i0, size_t i1, size_t i2, size_t i3, size_t i4, size_t i5, size_t i6, size_t i7> -__forceinline const avxb shuffle( const avxb& a ) { - return _mm256_cvtepi32_ps(_mm256_shuffle_epi32(a, _MM256_SHUFFLE(i7, i6, i5, i4, i3, i2, i1, i0))); -} - -/* -template<> __forceinline const avxb shuffle<0, 1, 0, 1, 0, 1, 0, 1>( const avxb& a ) { - return _mm_movelh_ps(a, a); -} - -template<> __forceinline const sseb shuffle<2, 3, 2, 3>( const sseb& a ) { - return _mm_movehl_ps(a, a); -} - -template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const sseb shuffle( const sseb& a, const sseb& b ) { - return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); -} - -template<> __forceinline const sseb shuffle<0, 1, 0, 1>( const sseb& a, const sseb& b ) { - return _mm_movelh_ps(a, b); -} - -template<> __forceinline const sseb shuffle<2, 3, 2, 3>( const sseb& a, const sseb& b ) { - return _mm_movehl_ps(b, a); -} - -#if defined(__KERNEL_SSE3__) -template<> __forceinline const sseb shuffle<0, 0, 2, 2>( const sseb& a ) { return _mm_moveldup_ps(a); } -template<> __forceinline const sseb shuffle<1, 1, 3, 3>( const sseb& a ) { return _mm_movehdup_ps(a); } -#endif - -#if defined(__KERNEL_SSE41__) -template<size_t dst, size_t src, size_t clr> __forceinline const sseb insert( const sseb& a, const sseb& b ) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); } -template<size_t dst, size_t src> __forceinline const sseb insert( const sseb& a, const sseb& b ) { return insert<dst, src, 0>(a, b); } -template<size_t dst> 
__forceinline const sseb insert( const sseb& a, const bool b ) { return insert<dst,0>(a, sseb(b)); } -#endif -*/ - //////////////////////////////////////////////////////////////////////////////// /// Reduction Operations //////////////////////////////////////////////////////////////////////////////// @@ -180,7 +142,7 @@ __forceinline size_t movemask( const avxb& a ) { return _mm256_movemask_ps(a); } ccl_device_inline void print_avxb(const char *label, const avxb &a) { - printf("%s: %df %df %df %df %df %df %df %d\n", + printf("%s: %d %d %d %d %d %d %d %d\n", label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]); } diff --git a/intern/cycles/util/util_avxf.h b/intern/cycles/util/util_avxf.h index 5596702ca20..f00c722f25b 100644 --- a/intern/cycles/util/util_avxf.h +++ b/intern/cycles/util/util_avxf.h @@ -40,8 +40,8 @@ struct avxf __forceinline avxf(const __m256 a) : m256(a) {} __forceinline avxf(const __m256i a) : m256(_mm256_castsi256_ps (a)) {} - __forceinline operator const __m256&(void) const { return m256; } - __forceinline operator __m256&(void) { return m256; } + __forceinline operator const __m256&() const { return m256; } + __forceinline operator __m256&() { return m256; } __forceinline avxf (float a) : m256(_mm256_set1_ps(a)) {} @@ -214,17 +214,19 @@ __forceinline const avxf nmadd(const avxf& a, const avxf& b, const avxf& c) { #endif } __forceinline const avxf msub(const avxf& a, const avxf& b, const avxf& c) { +#ifdef __KERNEL_AVX2__ return _mm256_fmsub_ps(a, b, c); +#else + return (a*b) - c; +#endif } //////////////////////////////////////////////////////////////////////////////// /// Comparison Operators //////////////////////////////////////////////////////////////////////////////// -#ifdef __KERNEL_AVX2__ __forceinline const avxb operator <=(const avxf& a, const avxf& b) { return _mm256_cmp_ps(a.m256, b.m256, _CMP_LE_OS); } -#endif #endif diff --git a/intern/cycles/util/util_boundbox.h b/intern/cycles/util/util_boundbox.h index 
b1bd5be0df3..fe89e398840 100644 --- a/intern/cycles/util/util_boundbox.h +++ b/intern/cycles/util/util_boundbox.h @@ -282,4 +282,4 @@ public: CCL_NAMESPACE_END -#endif /* __UTIL_BOUNDBOX_H__ */ +#endif /* __UTIL_BOUNDBOX_H__ */ diff --git a/intern/cycles/util/util_color.h b/intern/cycles/util/util_color.h index 826db469d25..e6efc7d73fc 100644 --- a/intern/cycles/util/util_color.h +++ b/intern/cycles/util/util_color.h @@ -240,4 +240,4 @@ ccl_device float4 color_srgb_to_linear_v4(float4 c) CCL_NAMESPACE_END -#endif /* __UTIL_COLOR_H__ */ +#endif /* __UTIL_COLOR_H__ */ diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h index f17f8a560ee..864089bb118 100644 --- a/intern/cycles/util/util_debug.h +++ b/intern/cycles/util/util_debug.h @@ -182,4 +182,4 @@ std::ostream& operator <<(std::ostream &os, CCL_NAMESPACE_END -#endif /* __UTIL_DEBUG_H__ */ +#endif /* __UTIL_DEBUG_H__ */ diff --git a/intern/cycles/util/util_defines.h b/intern/cycles/util/util_defines.h index 8bce4aca699..429cfe647ef 100644 --- a/intern/cycles/util/util_defines.h +++ b/intern/cycles/util/util_defines.h @@ -72,7 +72,7 @@ # if defined(__GNUC__) && (__GNUC__ >= 7) /* gcc7.0+ only */ # define ATTR_FALLTHROUGH __attribute__((fallthrough)) # else -# define ATTR_FALLTHROUGH ((void)0) +# define ATTR_FALLTHROUGH ((void) 0) # endif #endif /* __KERNEL_GPU__ */ @@ -104,14 +104,14 @@ template<typename T> static inline T decltype_helper(T x) { return x; } #define CHECK_TYPE(var, type) { \ TYPEOF(var) *__tmp; \ __tmp = (type *)NULL; \ - (void)__tmp; \ -} (void)0 + (void) __tmp; \ +} (void) 0 #define CHECK_TYPE_PAIR(var_a, var_b) { \ TYPEOF(var_a) *__tmp; \ __tmp = (typeof(var_b) *)NULL; \ - (void)__tmp; \ -} (void)0 + (void) __tmp; \ +} (void) 0 #else # define CHECK_TYPE(var, type) # define CHECK_TYPE_PAIR(var_a, var_b) @@ -128,4 +128,4 @@ template<typename T> static inline T decltype_helper(T x) { return x; } # define util_assert(statement) #endif -#endif /* __UTIL_DEFINES_H__ */ +#endif 
/* __UTIL_DEFINES_H__ */ diff --git a/intern/cycles/util/util_foreach.h b/intern/cycles/util/util_foreach.h index 2a74ff0a55d..fd106d58b43 100644 --- a/intern/cycles/util/util_foreach.h +++ b/intern/cycles/util/util_foreach.h @@ -21,4 +21,4 @@ #define foreach(x, y) for(x : y) -#endif /* __UTIL_FOREACH_H__ */ +#endif /* __UTIL_FOREACH_H__ */ diff --git a/intern/cycles/util/util_function.h b/intern/cycles/util/util_function.h index f3cc00329ad..72c7ce43073 100644 --- a/intern/cycles/util/util_function.h +++ b/intern/cycles/util/util_function.h @@ -36,4 +36,4 @@ using std::placeholders::_9; CCL_NAMESPACE_END -#endif /* __UTIL_FUNCTION_H__ */ +#endif /* __UTIL_FUNCTION_H__ */ diff --git a/intern/cycles/util/util_guarded_allocator.cpp b/intern/cycles/util/util_guarded_allocator.cpp index 54fa6a80df5..ae1d217c54f 100644 --- a/intern/cycles/util/util_guarded_allocator.cpp +++ b/intern/cycles/util/util_guarded_allocator.cpp @@ -35,12 +35,12 @@ void util_guarded_mem_free(size_t n) /* Public API. */ -size_t util_guarded_get_mem_used(void) +size_t util_guarded_get_mem_used() { return global_stats.mem_used; } -size_t util_guarded_get_mem_peak(void) +size_t util_guarded_get_mem_peak() { return global_stats.mem_peak; } diff --git a/intern/cycles/util/util_guarded_allocator.h b/intern/cycles/util/util_guarded_allocator.h index 87c1526dee4..2c6f1790fd0 100644 --- a/intern/cycles/util/util_guarded_allocator.h +++ b/intern/cycles/util/util_guarded_allocator.h @@ -47,7 +47,7 @@ public: T *allocate(size_t n, const void *hint = 0) { - (void)hint; + (void) hint; size_t size = n * sizeof(T); util_guarded_mem_alloc(size); if(n == 0) { @@ -158,8 +158,8 @@ public: }; /* Get memory usage and peak from the guarded STL allocator. */ -size_t util_guarded_get_mem_used(void); -size_t util_guarded_get_mem_peak(void); +size_t util_guarded_get_mem_used(); +size_t util_guarded_get_mem_peak(); /* Call given function and keep track if it runs out of memory. 
* diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h index 53b7f2472bd..3868509c21b 100644 --- a/intern/cycles/util/util_half.h +++ b/intern/cycles/util/util_half.h @@ -148,4 +148,4 @@ ccl_device_inline half float_to_half(float f) CCL_NAMESPACE_END -#endif /* __UTIL_HALF_H__ */ +#endif /* __UTIL_HALF_H__ */ diff --git a/intern/cycles/util/util_hash.h b/intern/cycles/util/util_hash.h index a8a5076fbb3..f343252eaca 100644 --- a/intern/cycles/util/util_hash.h +++ b/intern/cycles/util/util_hash.h @@ -68,4 +68,4 @@ ccl_device_inline float hash_int_01(uint k) CCL_NAMESPACE_END -#endif /* __UTIL_HASH_H__ */ +#endif /* __UTIL_HASH_H__ */ diff --git a/intern/cycles/util/util_ies.cpp b/intern/cycles/util/util_ies.cpp index e068957325b..e1de2e0c6e4 100644 --- a/intern/cycles/util/util_ies.cpp +++ b/intern/cycles/util/util_ies.cpp @@ -21,6 +21,13 @@ CCL_NAMESPACE_BEGIN +// NOTE: For some reason gcc-7.2 does not instantiate this version of allocator +// here (used in IESTextParser). Works fine for gcc-6, gcc-7.3 and gcc-8. +// +// TODO(sergey): Get to the root of this issue, or confirm this is a compiler +// issue.
+template class GuardedAllocator<char>; + bool IESFile::load(ustring ies) { clear(); diff --git a/intern/cycles/util/util_ies.h b/intern/cycles/util/util_ies.h index 5933cb3962a..663ad649a9c 100644 --- a/intern/cycles/util/util_ies.h +++ b/intern/cycles/util/util_ies.h @@ -58,4 +58,4 @@ protected: CCL_NAMESPACE_END -#endif /* __UTIL_IES_H__ */ +#endif /* __UTIL_IES_H__ */ diff --git a/intern/cycles/util/util_image.h b/intern/cycles/util/util_image.h index 85bdb0d8050..da5f56271c8 100644 --- a/intern/cycles/util/util_image.h +++ b/intern/cycles/util/util_image.h @@ -102,6 +102,6 @@ inline half util_image_cast_from_float(float value) CCL_NAMESPACE_END -#endif /* __UTIL_IMAGE_H__ */ +#endif /* __UTIL_IMAGE_H__ */ #include "util/util_image_impl.h" diff --git a/intern/cycles/util/util_list.h b/intern/cycles/util/util_list.h index f555b001186..fcf8e4f5c74 100644 --- a/intern/cycles/util/util_list.h +++ b/intern/cycles/util/util_list.h @@ -25,4 +25,4 @@ using std::list; CCL_NAMESPACE_END -#endif /* __UTIL_LIST_H__ */ +#endif /* __UTIL_LIST_H__ */ diff --git a/intern/cycles/util/util_logging.cpp b/intern/cycles/util/util_logging.cpp index f38683bf7de..b0922db32fb 100644 --- a/intern/cycles/util/util_logging.cpp +++ b/intern/cycles/util/util_logging.cpp @@ -45,7 +45,7 @@ void util_logging_init(const char *argv0) #endif } -void util_logging_start(void) +void util_logging_start() { #ifdef WITH_CYCLES_LOGGING using CYCLES_GFLAGS_NAMESPACE::SetCommandLineOption; diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h index 5c84b6593d3..f66d7c92dcc 100644 --- a/intern/cycles/util/util_logging.h +++ b/intern/cycles/util/util_logging.h @@ -41,7 +41,7 @@ public: void operator&(StubStream&) { } }; -# define LOG_SUPPRESS() (true) ? (void) 0 : LogMessageVoidify() & StubStream() +# define LOG_SUPPRESS() (true) ? 
((void) 0) : LogMessageVoidify() & StubStream() # define LOG(severity) LOG_SUPPRESS() # define VLOG(severity) LOG_SUPPRESS() #endif @@ -52,7 +52,7 @@ struct int2; struct float3; void util_logging_init(const char *argv0); -void util_logging_start(void); +void util_logging_start(); void util_logging_verbosity_set(int verbosity); std::ostream& operator <<(std::ostream &os, @@ -62,4 +62,4 @@ std::ostream& operator <<(std::ostream &os, CCL_NAMESPACE_END -#endif /* __UTIL_LOGGING_H__ */ +#endif /* __UTIL_LOGGING_H__ */ diff --git a/intern/cycles/util/util_map.h b/intern/cycles/util/util_map.h index 3c9288417cf..1952d33ada8 100644 --- a/intern/cycles/util/util_map.h +++ b/intern/cycles/util/util_map.h @@ -28,4 +28,4 @@ using std::unordered_map; CCL_NAMESPACE_END -#endif /* __UTIL_MAP_H__ */ +#endif /* __UTIL_MAP_H__ */ diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index 52aeb8d8599..6167119f873 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -157,7 +157,7 @@ ccl_device_inline T max4(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); } -#endif /* __KERNEL_GPU__ */ +#endif /* __KERNEL_GPU__ */ ccl_device_inline float min4(float a, float b, float c, float d) { @@ -220,7 +220,31 @@ ccl_device_inline float __uint_as_float(uint i) u.i = i; return u.f; } -#endif /* __KERNEL_OPENCL__ */ + +ccl_device_inline int4 __float4_as_int4(float4 f) +{ +#ifdef __KERNEL_SSE__ + return int4(_mm_castps_si128(f.m128)); + #else + return make_int4(__float_as_int(f.x), + __float_as_int(f.y), + __float_as_int(f.z), + __float_as_int(f.w)); +#endif +} + +ccl_device_inline float4 __int4_as_float4(int4 i) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_castsi128_ps(i.m128)); +#else + return make_float4(__int_as_float(i.x), + __int_as_float(i.y), + __int_as_float(i.z), + __int_as_float(i.w)); +#endif +} +#endif /* __KERNEL_OPENCL__ */ /* Versions of functions which are safe for fast math. 
*/ ccl_device_inline bool isnan_safe(float f) @@ -615,4 +639,4 @@ ccl_device_inline float2 map_to_sphere(const float3 co) CCL_NAMESPACE_END -#endif /* __UTIL_MATH_H__ */ +#endif /* __UTIL_MATH_H__ */ diff --git a/intern/cycles/util/util_math_cdf.h b/intern/cycles/util/util_math_cdf.h index 79643fe26e3..983855e3e9b 100644 --- a/intern/cycles/util/util_math_cdf.h +++ b/intern/cycles/util/util_math_cdf.h @@ -75,4 +75,4 @@ void util_cdf_inverted(const int resolution, CCL_NAMESPACE_END -#endif /* __UTIL_MATH_H_CDF__ */ +#endif /* __UTIL_MATH_H_CDF__ */ diff --git a/intern/cycles/util/util_math_fast.h b/intern/cycles/util/util_math_fast.h index d3960deb3b4..323d40058e5 100644 --- a/intern/cycles/util/util_math_fast.h +++ b/intern/cycles/util/util_math_fast.h @@ -58,6 +58,11 @@ ccl_device_inline float madd(const float a, const float b, const float c) return a * b + c; } +ccl_device_inline float4 madd4(const float4 a, const float4 b, const float4 c) +{ + return a * b + c; +} + /* * FAST & APPROXIMATE MATH * @@ -438,6 +443,29 @@ ccl_device_inline float fast_expf(float x) return fast_exp2f(x / M_LN2_F); } +#ifndef __KERNEL_GPU__ +ccl_device float4 fast_exp2f4(float4 x) +{ + const float4 one = make_float4(1.0f); + const float4 limit = make_float4(126.0f); + x = clamp(x, -limit, limit); + int4 m = make_int4(x); + x = one - (one - (x - make_float4(m))); + float4 r = make_float4(1.33336498402e-3f); + r = madd4(x, r, make_float4(9.810352697968e-3f)); + r = madd4(x, r, make_float4(5.551834031939e-2f)); + r = madd4(x, r, make_float4(0.2401793301105f)); + r = madd4(x, r, make_float4(0.693144857883f)); + r = madd4(x, r, make_float4(1.0f)); + return __int4_as_float4(__float4_as_int4(r) + (m << 23)); +} + +ccl_device_inline float4 fast_expf4(float4 x) +{ + return fast_exp2f4(x / M_LN2_F); +} +#endif + ccl_device_inline float fast_exp10(float x) { /* Examined 2217701018 values of exp10 on [-37.9290009,37.9290009]: diff --git a/intern/cycles/util/util_math_float2.h 
b/intern/cycles/util/util_math_float2.h index 6f9d0855d50..e937509367f 100644 --- a/intern/cycles/util/util_math_float2.h +++ b/intern/cycles/util/util_math_float2.h @@ -224,4 +224,4 @@ ccl_device_inline float2 interp(const float2& a, const float2& b, float t) CCL_NAMESPACE_END -#endif /* __UTIL_MATH_FLOAT2_H__ */ +#endif /* __UTIL_MATH_FLOAT2_H__ */ diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h index 75265c1c9a2..a54a3f3087c 100644 --- a/intern/cycles/util/util_math_float3.h +++ b/intern/cycles/util/util_math_float3.h @@ -419,4 +419,4 @@ ccl_device_inline float3 ensure_finite3(float3 v) CCL_NAMESPACE_END -#endif /* __UTIL_MATH_FLOAT3_H__ */ +#endif /* __UTIL_MATH_FLOAT3_H__ */ diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h index aa7e56fefe9..479ccf202ba 100644 --- a/intern/cycles/util/util_math_float4.h +++ b/intern/cycles/util/util_math_float4.h @@ -38,6 +38,7 @@ ccl_device_inline float4 operator+(const float4& a, const float4& b); ccl_device_inline float4 operator-(const float4& a, const float4& b); ccl_device_inline float4 operator+=(float4& a, const float4& b); ccl_device_inline float4 operator*=(float4& a, const float4& b); +ccl_device_inline float4 operator*=(float4& a, float f); ccl_device_inline float4 operator/=(float4& a, float f); ccl_device_inline int4 operator<(const float4& a, const float4& b); @@ -58,6 +59,7 @@ ccl_device_inline float4 normalize(const float4& a); ccl_device_inline float4 safe_normalize(const float4& a); ccl_device_inline float4 min(const float4& a, const float4& b); ccl_device_inline float4 max(const float4& a, const float4& b); +ccl_device_inline float4 clamp(const float4& a, const float4& mn, const float4& mx); ccl_device_inline float4 fabs(const float4& a); #endif /* !__KERNEL_OPENCL__*/ @@ -168,6 +170,11 @@ ccl_device_inline float4 operator*=(float4& a, const float4& b) return a = a * b; } +ccl_device_inline float4 operator*=(float4& a, 
float f) +{ + return a = a * f; +} + ccl_device_inline float4 operator/=(float4& a, float f) { return a = a / f; @@ -333,6 +340,11 @@ ccl_device_inline float4 max(const float4& a, const float4& b) #endif } +ccl_device_inline float4 clamp(const float4& a, const float4& mn, const float4& mx) +{ + return min(max(a, mn), mx); +} + ccl_device_inline float4 fabs(const float4& a) { #ifdef __KERNEL_SSE__ @@ -445,4 +457,4 @@ ccl_device_inline float4 load_float4(const float *v) CCL_NAMESPACE_END -#endif /* __UTIL_MATH_FLOAT4_H__ */ +#endif /* __UTIL_MATH_FLOAT4_H__ */ diff --git a/intern/cycles/util/util_math_int2.h b/intern/cycles/util/util_math_int2.h index 828c49a131c..dd401d9a091 100644 --- a/intern/cycles/util/util_math_int2.h +++ b/intern/cycles/util/util_math_int2.h @@ -74,4 +74,4 @@ ccl_device_inline int2 operator/(const int2 &a, const int2 &b) CCL_NAMESPACE_END -#endif /* __UTIL_MATH_INT2_H__ */ +#endif /* __UTIL_MATH_INT2_H__ */ diff --git a/intern/cycles/util/util_math_int3.h b/intern/cycles/util/util_math_int3.h index 81b10f31f4a..2f4752f90f1 100644 --- a/intern/cycles/util/util_math_int3.h +++ b/intern/cycles/util/util_math_int3.h @@ -113,4 +113,4 @@ ccl_device_inline int3 operator-(const int3 &a, const int3 &b) CCL_NAMESPACE_END -#endif /* __UTIL_MATH_INT3_H__ */ +#endif /* __UTIL_MATH_INT3_H__ */ diff --git a/intern/cycles/util/util_math_int4.h b/intern/cycles/util/util_math_int4.h index 79a8c0841e7..763c42318d5 100644 --- a/intern/cycles/util/util_math_int4.h +++ b/intern/cycles/util/util_math_int4.h @@ -31,6 +31,10 @@ CCL_NAMESPACE_BEGIN ccl_device_inline int4 operator+(const int4& a, const int4& b); ccl_device_inline int4 operator+=(int4& a, const int4& b); ccl_device_inline int4 operator>>(const int4& a, int i); +ccl_device_inline int4 operator<<(const int4& a, int i); +ccl_device_inline int4 operator<(const int4& a, const int4& b); +ccl_device_inline int4 operator>=(const int4& a, const int4& b); +ccl_device_inline int4 operator&(const int4& a, const 
int4& b); ccl_device_inline int4 min(int4 a, int4 b); ccl_device_inline int4 max(int4 a, int4 b); ccl_device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx); @@ -65,6 +69,42 @@ ccl_device_inline int4 operator>>(const int4& a, int i) #endif } +ccl_device_inline int4 operator<<(const int4& a, int i) +{ +#ifdef __KERNEL_SSE__ + return int4(_mm_slli_epi32(a.m128, i)); +#else + return make_int4(a.x << i, a.y << i, a.z << i, a.w << i); +#endif +} + +ccl_device_inline int4 operator<(const int4& a, const int4& b) +{ +#ifdef __KERNEL_SSE__ + return int4(_mm_cmplt_epi32(a.m128, b.m128)); +#else + return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w); +#endif +} + +ccl_device_inline int4 operator>=(const int4& a, const int4& b) +{ +#ifdef __KERNEL_SSE__ + return int4(_mm_xor_si128(_mm_set1_epi32(0xffffffff), _mm_cmplt_epi32(a.m128, b.m128))); +#else + return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w); +#endif +} + +ccl_device_inline int4 operator&(const int4& a, const int4& b) +{ +#ifdef __KERNEL_SSE__ + return int4(_mm_and_si128(a.m128, b.m128)); +#else + return make_int4(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w); +#endif +} + ccl_device_inline int4 min(int4 a, int4 b) { #if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) @@ -116,4 +156,4 @@ ccl_device_inline int4 load_int4(const int *v) CCL_NAMESPACE_END -#endif /* __UTIL_MATH_INT4_H__ */ +#endif /* __UTIL_MATH_INT4_H__ */ diff --git a/intern/cycles/util/util_math_intersect.h b/intern/cycles/util/util_math_intersect.h index b5fbb24091f..190c2f5d6b0 100644 --- a/intern/cycles/util/util_math_intersect.h +++ b/intern/cycles/util/util_math_intersect.h @@ -219,4 +219,4 @@ ccl_device bool ray_quad_intersect(float3 ray_P, float3 ray_D, CCL_NAMESPACE_END -#endif /* __UTIL_MATH_INTERSECT_H__ */ +#endif /* __UTIL_MATH_INTERSECT_H__ */ diff --git a/intern/cycles/util/util_md5.h b/intern/cycles/util/util_md5.h index 9023ccee4c2..f8c0115d8ce 100644 --- a/intern/cycles/util/util_md5.h +++ 
b/intern/cycles/util/util_md5.h @@ -58,4 +58,4 @@ string util_md5_string(const string& str); CCL_NAMESPACE_END -#endif /* __UTIL_MD5_H__ */ +#endif /* __UTIL_MD5_H__ */ diff --git a/intern/cycles/util/util_murmurhash.cpp b/intern/cycles/util/util_murmurhash.cpp new file mode 100644 index 00000000000..68b2f2031be --- /dev/null +++ b/intern/cycles/util/util_murmurhash.cpp @@ -0,0 +1,127 @@ +/* + * Copyright 2018 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This is taken from alShaders/Cryptomatte/MurmurHash3.h: + * + * MurmurHash3 was written by Austin Appleby, and is placed in the public + * domain. The author hereby disclaims copyright to this source code. + * + */ + +#include <stdlib.h> +#include <string.h> + +#include "util/util_algorithm.h" +#include "util/util_murmurhash.h" + +#if defined(_MSC_VER) +# define ROTL32(x,y) _rotl(x,y) +# define ROTL64(x,y) _rotl64(x,y) +# define BIG_CONSTANT(x) (x) +#else +ccl_device_inline uint32_t rotl32(uint32_t x, int8_t r) +{ + return (x << r) | (x >> (32 - r)); +} +# define ROTL32(x,y) rotl32(x,y) +# define BIG_CONSTANT(x) (x##LLU) +#endif + +CCL_NAMESPACE_BEGIN + +/* Block read - if your platform needs to do endian-swapping or can only + * handle aligned reads, do the conversion here. 
*/ +ccl_device_inline uint32_t mm_hash_getblock32(const uint32_t *p, int i) +{ + return p[i]; +} + +/* Finalization mix - force all bits of a hash block to avalanche */ +ccl_device_inline uint32_t mm_hash_fmix32 ( uint32_t h ) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + return h; +} + +uint32_t util_murmur_hash3(const void *key, int len, uint32_t seed) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 4; + + uint32_t h1 = seed; + + const uint32_t c1 = 0xcc9e2d51; + const uint32_t c2 = 0x1b873593; + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); + + for(int i = -nblocks; i; i++) { + uint32_t k1 = mm_hash_getblock32(blocks,i); + + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; + + h1 ^= k1; + h1 = ROTL32(h1,13); + h1 = h1 * 5 + 0xe6546b64; + } + + const uint8_t *tail = (const uint8_t*)(data + nblocks*4); + + uint32_t k1 = 0; + + switch(len & 3) { + case 3: + k1 ^= tail[2] << 16; + ATTR_FALLTHROUGH; + case 2: + k1 ^= tail[1] << 8; + ATTR_FALLTHROUGH; + case 1: + k1 ^= tail[0]; + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; + h1 ^= k1; + } + + h1 ^= len; + h1 = mm_hash_fmix32(h1); + return h1; +} + +/* This is taken from the cryptomatte specification 1.0 */ +float util_hash_to_float(uint32_t hash) +{ + uint32_t mantissa = hash & (( 1 << 23) - 1); + uint32_t exponent = (hash >> 23) & ((1 << 8) - 1); + exponent = max(exponent, (uint32_t) 1); + exponent = min(exponent, (uint32_t) 254); + exponent = exponent << 23; + uint32_t sign = (hash >> 31); + sign = sign << 31; + uint32_t float_bits = sign | exponent | mantissa; + float f; + memcpy(&f, &float_bits, sizeof(uint32_t)); + return f; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_murmurhash.h b/intern/cycles/util/util_murmurhash.h new file mode 100644 index 00000000000..3e7897d3ae6 --- /dev/null +++ b/intern/cycles/util/util_murmurhash.h @@ -0,0 +1,30 @@ +/* + * Copyright 2018 Blender Foundation + * + * Licensed under 
the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#ifndef __UTIL_MURMURHASH_H__ +#define __UTIL_MURMURHASH_H__ + +#include "util/util_types.h" + +CCL_NAMESPACE_BEGIN + +uint32_t util_murmur_hash3(const void *key, int len, uint32_t seed); +float util_hash_to_float(uint32_t hash); + +CCL_NAMESPACE_END + +#endif /* __UTIL_MURMURHASH_H__ */ diff --git a/intern/cycles/util/util_opengl.h b/intern/cycles/util/util_opengl.h index 0b5462e0a09..2ca7b7e4c87 100644 --- a/intern/cycles/util/util_opengl.h +++ b/intern/cycles/util/util_opengl.h @@ -28,4 +28,4 @@ # define mxMakeCurrentContext(x) (x) #endif -#endif /* __UTIL_OPENGL_H__ */ +#endif /* __UTIL_OPENGL_H__ */ diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h index 3b3627242d5..5267bd9a97a 100644 --- a/intern/cycles/util/util_optimization.h +++ b/intern/cycles/util/util_optimization.h @@ -68,4 +68,4 @@ #endif -#endif /* __UTIL_OPTIMIZATION_H__ */ +#endif /* __UTIL_OPTIMIZATION_H__ */ diff --git a/intern/cycles/util/util_param.h b/intern/cycles/util/util_param.h index 246b5cb7d63..4453c66aae2 100644 --- a/intern/cycles/util/util_param.h +++ b/intern/cycles/util/util_param.h @@ -30,4 +30,4 @@ OIIO_NAMESPACE_USING CCL_NAMESPACE_END -#endif /* __UTIL_PARAM_H__ */ +#endif /* __UTIL_PARAM_H__ */ diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp index 51b7944705e..93080a6c80c 100644 --- a/intern/cycles/util/util_path.cpp +++ b/intern/cycles/util/util_path.cpp @@ 
-614,7 +614,7 @@ bool path_exists(const string& path) return 0; } return st.st_mode != 0; -#endif /* _WIN32 */ +#endif /* _WIN32 */ } bool path_is_directory(const string& path) diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h index 3ef15c5c09a..4ed9ebd60ff 100644 --- a/intern/cycles/util/util_progress.h +++ b/intern/cycles/util/util_progress.h @@ -114,7 +114,7 @@ public: return cancel_message; } - void set_cancel_callback(function<void(void)> function) + void set_cancel_callback(function<void()> function) { cancel_cb = function; } @@ -323,7 +323,7 @@ public: } } - void set_update_callback(function<void(void)> function) + void set_update_callback(function<void()> function) { update_cb = function; } @@ -331,8 +331,8 @@ public: protected: thread_mutex progress_mutex; thread_mutex update_mutex; - function<void(void)> update_cb; - function<void(void)> cancel_cb; + function<void()> update_cb; + function<void()> cancel_cb; /* pixel_samples counts how many samples have been rendered over all pixel, not just per pixel. * This makes the progress estimate more accurate when tiles with different sizes are used. 
@@ -365,4 +365,4 @@ protected: CCL_NAMESPACE_END -#endif /* __UTIL_PROGRESS_H__ */ +#endif /* __UTIL_PROGRESS_H__ */ diff --git a/intern/cycles/util/util_projection.h b/intern/cycles/util/util_projection.h index 26b4843928c..d1af013ae3a 100644 --- a/intern/cycles/util/util_projection.h +++ b/intern/cycles/util/util_projection.h @@ -169,8 +169,8 @@ ccl_device_inline ProjectionTransform projection_orthographic(float znear, float return ProjectionTransform(t); } -#endif /* __KERNEL_GPU__ */ +#endif /* __KERNEL_GPU__ */ CCL_NAMESPACE_END -#endif /* __UTIL_PROJECTION_H__ */ +#endif /* __UTIL_PROJECTION_H__ */ diff --git a/intern/cycles/util/util_queue.h b/intern/cycles/util/util_queue.h index 622f4fe3e47..0a2b7718f57 100644 --- a/intern/cycles/util/util_queue.h +++ b/intern/cycles/util/util_queue.h @@ -25,4 +25,4 @@ using std::queue; CCL_NAMESPACE_END -#endif /* __UTIL_LIST_H__ */ +#endif /* __UTIL_LIST_H__ */ diff --git a/intern/cycles/util/util_rect.h b/intern/cycles/util/util_rect.h index d13baefe85e..389669acf2e 100644 --- a/intern/cycles/util/util_rect.h +++ b/intern/cycles/util/util_rect.h @@ -69,4 +69,4 @@ ccl_device_inline int rect_size(int4 rect) CCL_NAMESPACE_END -#endif /* __UTIL_RECT_H__ */ +#endif /* __UTIL_RECT_H__ */ diff --git a/intern/cycles/util/util_set.h b/intern/cycles/util/util_set.h index 298e1f7729a..a9c56bb4919 100644 --- a/intern/cycles/util/util_set.h +++ b/intern/cycles/util/util_set.h @@ -31,4 +31,4 @@ using std::unordered_set; CCL_NAMESPACE_END -#endif /* __UTIL_SET_H__ */ +#endif /* __UTIL_SET_H__ */ diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h index cc7f436c8fe..565ea768089 100644 --- a/intern/cycles/util/util_simd.h +++ b/intern/cycles/util/util_simd.h @@ -219,9 +219,9 @@ __forceinline size_t __bscf(size_t& v) return i; } -#endif /* __KERNEL_64_BIT__ */ +#endif /* __KERNEL_64_BIT__ */ -#else /* _WIN32 */ +#else /* _WIN32 */ __forceinline unsigned int __popcnt(unsigned int in) { int r = 0; asm ("popcnt 
%1,%0" : "=r"(r) : "r"(in)); return r; @@ -344,7 +344,7 @@ __forceinline size_t __bscf(size_t& v) } #endif -#endif /* _WIN32 */ +#endif /* _WIN32 */ /* Test __KERNEL_SSE41__ for MSVC which does not define __SSE4_1__, and test * __SSE4_1__ to avoid OpenImageIO conflicts with our emulation macros on other @@ -442,7 +442,7 @@ __forceinline __m128 _mm_round_ps_emu( __m128 value, const int flags) return value; } -#endif /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */ +#endif /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */ #else /* __KERNEL_SSE2__ */ @@ -470,7 +470,7 @@ ccl_device_inline int __bsr(int value) return bit; } -#endif /* __KERNEL_SSE2__ */ +#endif /* __KERNEL_SSE2__ */ /* quiet unused define warnings */ #if defined(__KERNEL_SSE2__) || \ @@ -484,6 +484,6 @@ ccl_device_inline int __bsr(int value) CCL_NAMESPACE_END -#endif /* __KERNEL_GPU__ */ +#endif /* __KERNEL_GPU__ */ -#endif /* __UTIL_SIMD_TYPES_H__ */ +#endif /* __UTIL_SIMD_TYPES_H__ */ diff --git a/intern/cycles/util/util_sky_model.h b/intern/cycles/util/util_sky_model.h index 237e4e61bf5..2e593f58c39 100644 --- a/intern/cycles/util/util_sky_model.h +++ b/intern/cycles/util/util_sky_model.h @@ -437,6 +437,6 @@ double arhosekskymodel_solar_radiance(ArHosekSkyModelState* state, double wavelength); -#endif // _SKY_MODEL_H_ +#endif // _SKY_MODEL_H_ CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_sseb.h b/intern/cycles/util/util_sseb.h index 115b133c662..f6810505126 100644 --- a/intern/cycles/util/util_sseb.h +++ b/intern/cycles/util/util_sseb.h @@ -177,7 +177,7 @@ __forceinline size_t movemask( const sseb& a ) { return _mm_movemask_ps(a); } ccl_device_inline void print_sseb(const char *label, const sseb &a) { - printf("%s: %df %df %df %d\n", + printf("%s: %d %d %d %d\n", label, a[0], a[1], a[2], a[3]); } diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h index b5623860e33..66670c9a779 100644 --- 
a/intern/cycles/util/util_ssef.h +++ b/intern/cycles/util/util_ssef.h @@ -44,8 +44,8 @@ struct ssef __forceinline ssef& operator=(const ssef& other) { m128 = other.m128; return *this; } __forceinline ssef(const __m128 a) : m128(a) {} - __forceinline operator const __m128&(void) const { return m128; } - __forceinline operator __m128&(void) { return m128; } + __forceinline operator const __m128&() const { return m128; } + __forceinline operator __m128&() { return m128; } __forceinline ssef (float a) : m128(_mm_set1_ps(a)) {} __forceinline ssef (float a, float b, float c, float d) : m128(_mm_setr_ps(a, b, c, d)) {} @@ -517,12 +517,12 @@ ccl_device_inline float len3(const ssef& a) /* faster version for SSSE3 */ typedef ssei shuffle_swap_t; -ccl_device_inline shuffle_swap_t shuffle_swap_identity(void) +ccl_device_inline shuffle_swap_t shuffle_swap_identity() { return _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); } -ccl_device_inline shuffle_swap_t shuffle_swap_swap(void) +ccl_device_inline shuffle_swap_t shuffle_swap_swap() { return _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); } @@ -537,12 +537,12 @@ ccl_device_inline const ssef shuffle_swap(const ssef& a, const shuffle_swap_t& s /* somewhat slower version for SSE2 */ typedef int shuffle_swap_t; -ccl_device_inline shuffle_swap_t shuffle_swap_identity(void) +ccl_device_inline shuffle_swap_t shuffle_swap_identity() { return 0; } -ccl_device_inline shuffle_swap_t shuffle_swap_swap(void) +ccl_device_inline shuffle_swap_t shuffle_swap_swap() { return 1; } diff --git a/intern/cycles/util/util_stack_allocator.h b/intern/cycles/util/util_stack_allocator.h index 79a535bd170..4e978e18bee 100644 --- a/intern/cycles/util/util_stack_allocator.h +++ b/intern/cycles/util/util_stack_allocator.h @@ -53,7 +53,7 @@ public: T *allocate(size_t n, const void *hint = 0) { - (void)hint; + (void) hint; if(n == 0) { return NULL; } @@ -164,4 +164,4 @@ private: CCL_NAMESPACE_END -#endif /* 
__UTIL_GUARDED_ALLOCATOR_H__ */ +#endif /* __UTIL_STACK_ALLOCATOR_H__ */ diff --git a/intern/cycles/util/util_static_assert.h b/intern/cycles/util/util_static_assert.h index dc3cb3f6ecc..b1c6c374693 100644 --- a/intern/cycles/util/util_static_assert.h +++ b/intern/cycles/util/util_static_assert.h @@ -47,4 +47,4 @@ CCL_NAMESPACE_BEGIN CCL_NAMESPACE_END -#endif /* __UTIL_STATIC_ASSERT_H__ */ +#endif /* __UTIL_STATIC_ASSERT_H__ */ diff --git a/intern/cycles/util/util_stats.h b/intern/cycles/util/util_stats.h index 7667f58eb7d..0ba58422a67 100644 --- a/intern/cycles/util/util_stats.h +++ b/intern/cycles/util/util_stats.h @@ -44,4 +44,4 @@ public: CCL_NAMESPACE_END -#endif /* __UTIL_STATS_H__ */ +#endif /* __UTIL_STATS_H__ */ diff --git a/intern/cycles/util/util_string.h b/intern/cycles/util/util_string.h index 3a4f4398158..f17112c30d2 100644 --- a/intern/cycles/util/util_string.h +++ b/intern/cycles/util/util_string.h @@ -74,4 +74,4 @@ string string_human_readable_number(size_t num); CCL_NAMESPACE_END -#endif /* __UTIL_STRING_H__ */ +#endif /* __UTIL_STRING_H__ */ diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp index 1b039888452..34f428f111c 100644 --- a/intern/cycles/util/util_system.cpp +++ b/intern/cycles/util/util_system.cpp @@ -52,14 +52,14 @@ int system_cpu_group_thread_count(int group) util_windows_init_numa_groups(); return GetActiveProcessorCount(group); #elif defined(__APPLE__) - (void)group; + (void) group; int count; size_t len = sizeof(count); int mib[2] = { CTL_HW, HW_NCPU }; sysctl(mib, 2, &count, &len, NULL, 0); return count; #else - (void)group; + (void) group; return sysconf(_SC_NPROCESSORS_ONLN); #endif } diff --git a/intern/cycles/util/util_system.h b/intern/cycles/util/util_system.h index 42ad72356b9..241ac897157 100644 --- a/intern/cycles/util/util_system.h +++ b/intern/cycles/util/util_system.h @@ -46,4 +46,4 @@ size_t system_physical_ram(); CCL_NAMESPACE_END -#endif /* __UTIL_SYSTEM_H__ */ +#endif /* 
__UTIL_SYSTEM_H__ */ diff --git a/intern/cycles/util/util_texture.h b/intern/cycles/util/util_texture.h index f752e81128d..233cfe33305 100644 --- a/intern/cycles/util/util_texture.h +++ b/intern/cycles/util/util_texture.h @@ -89,4 +89,4 @@ typedef struct TextureInfo { CCL_NAMESPACE_END -#endif /* __UTIL_TEXTURE_H__ */ +#endif /* __UTIL_TEXTURE_H__ */ diff --git a/intern/cycles/util/util_thread.cpp b/intern/cycles/util/util_thread.cpp index 16a8591a8a9..37d8bdbd4b0 100644 --- a/intern/cycles/util/util_thread.cpp +++ b/intern/cycles/util/util_thread.cpp @@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN -thread::thread(function<void(void)> run_cb, int group) +thread::thread(function<void()> run_cb, int group) : run_cb_(run_cb), joined_(false), group_(group) diff --git a/intern/cycles/util/util_thread.h b/intern/cycles/util/util_thread.h index f39fcfb4279..6250bb95dcf 100644 --- a/intern/cycles/util/util_thread.h +++ b/intern/cycles/util/util_thread.h @@ -46,14 +46,14 @@ typedef std::condition_variable thread_condition_variable; class thread { public: - thread(function<void(void)> run_cb, int group = -1); + thread(function<void()> run_cb, int group = -1); ~thread(); static void *run(void *arg); bool join(); protected: - function<void(void)> run_cb_; + function<void()> run_cb_; std::thread thread_; bool joined_; int group_; @@ -138,4 +138,4 @@ protected: CCL_NAMESPACE_END -#endif /* __UTIL_THREAD_H__ */ +#endif /* __UTIL_THREAD_H__ */ diff --git a/intern/cycles/util/util_transform.h b/intern/cycles/util/util_transform.h index e781f85dded..e4cadd3e81a 100644 --- a/intern/cycles/util/util_transform.h +++ b/intern/cycles/util/util_transform.h @@ -424,6 +424,31 @@ ccl_device void transform_motion_array_interpolate(Transform *tfm, #ifndef __KERNEL_GPU__ +#ifdef WITH_EMBREE +ccl_device void transform_motion_array_interpolate_straight(Transform *tfm, + const ccl_global DecomposedTransform *motion, + uint numsteps, + float time) +{ + /* Figure out which steps we need to interpolate. 
*/ + int maxstep = numsteps - 1; + int step = min((int)(time*maxstep), maxstep - 1); + float t = time * maxstep - step; + + const ccl_global DecomposedTransform *a = motion + step; + const ccl_global DecomposedTransform *b = motion + step + 1; + Transform step1, step2; + + transform_compose(&step1, a); + transform_compose(&step2, b); + + /* matrix lerp */ + tfm->x = (1.0f - t) * step1.x + t * step2.x; + tfm->y = (1.0f - t) * step1.y + t * step2.y; + tfm->z = (1.0f - t) * step1.z + t * step2.z; +} +#endif + class BoundBox2D; ccl_device_inline bool operator==(const DecomposedTransform& A, const DecomposedTransform& B) @@ -470,4 +495,4 @@ OPENCL_TRANSFORM_ADDRSPACE_DECLARE(transform_direction_transposed) CCL_NAMESPACE_END -#endif /* __UTIL_TRANSFORM_H__ */ +#endif /* __UTIL_TRANSFORM_H__ */ diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index 96c549b9be5..535048d8f8c 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -156,4 +156,4 @@ CCL_NAMESPACE_END #endif #endif -#endif /* __UTIL_TYPES_H__ */ +#endif /* __UTIL_TYPES_H__ */ diff --git a/intern/cycles/util/util_types_float3.h b/intern/cycles/util/util_types_float3.h index 28146ad04f7..ed2300e7996 100644 --- a/intern/cycles/util/util_types_float3.h +++ b/intern/cycles/util/util_types_float3.h @@ -35,8 +35,8 @@ struct ccl_try_align(16) float3 { __forceinline float3(const float3& a); __forceinline explicit float3(const __m128& a); - __forceinline operator const __m128&(void) const; - __forceinline operator __m128&(void); + __forceinline operator const __m128&() const; + __forceinline operator __m128&(); __forceinline float3& operator =(const float3& a); #else /* __KERNEL_SSE__ */ diff --git a/intern/cycles/util/util_types_float3_impl.h b/intern/cycles/util/util_types_float3_impl.h index 45f61767d3f..2e840a5c399 100644 --- a/intern/cycles/util/util_types_float3_impl.h +++ b/intern/cycles/util/util_types_float3_impl.h @@ -43,12 +43,12 @@ __forceinline 
float3::float3(const __m128& a) { } -__forceinline float3::operator const __m128&(void) const +__forceinline float3::operator const __m128&() const { return m128; } -__forceinline float3::operator __m128&(void) +__forceinline float3::operator __m128&() { return m128; } diff --git a/intern/cycles/util/util_types_float4.h b/intern/cycles/util/util_types_float4.h index 154391f6881..5c10d483c2e 100644 --- a/intern/cycles/util/util_types_float4.h +++ b/intern/cycles/util/util_types_float4.h @@ -36,8 +36,8 @@ struct ccl_try_align(16) float4 { __forceinline float4(); __forceinline explicit float4(const __m128& a); - __forceinline operator const __m128&(void) const; - __forceinline operator __m128&(void); + __forceinline operator const __m128&() const; + __forceinline operator __m128&(); __forceinline float4& operator =(const float4& a); diff --git a/intern/cycles/util/util_types_float4_impl.h b/intern/cycles/util/util_types_float4_impl.h index a49fac65b10..a83148031f1 100644 --- a/intern/cycles/util/util_types_float4_impl.h +++ b/intern/cycles/util/util_types_float4_impl.h @@ -38,12 +38,12 @@ __forceinline float4::float4(const __m128& a) { } -__forceinline float4::operator const __m128&(void) const +__forceinline float4::operator const __m128&() const { return m128; } -__forceinline float4::operator __m128&(void) +__forceinline float4::operator __m128&() { return m128; } diff --git a/intern/cycles/util/util_types_float8.h b/intern/cycles/util/util_types_float8.h index 64ec5a71355..08720b8ff48 100644 --- a/intern/cycles/util/util_types_float8.h +++ b/intern/cycles/util/util_types_float8.h @@ -37,7 +37,7 @@ CCL_NAMESPACE_BEGIN #ifndef __KERNEL_GPU__ -struct ccl_try_align(16) float8 { +struct ccl_try_align(32) float8 { #ifdef __KERNEL_AVX2__ union { __m256 m256; @@ -48,8 +48,8 @@ struct ccl_try_align(16) float8 { __forceinline float8(const float8& a); __forceinline explicit float8(const __m256& a); - __forceinline operator const __m256&(void) const; - __forceinline operator 
__m256&(void); + __forceinline operator const __m256&() const; + __forceinline operator __m256&(); __forceinline float8& operator =(const float8& a); diff --git a/intern/cycles/util/util_types_float8_impl.h b/intern/cycles/util/util_types_float8_impl.h index ebf8260bc7c..84fe233c334 100644 --- a/intern/cycles/util/util_types_float8_impl.h +++ b/intern/cycles/util/util_types_float8_impl.h @@ -55,12 +55,12 @@ __forceinline float8::float8(const __m256& f) { } -__forceinline float8::operator const __m256&(void) const +__forceinline float8::operator const __m256&() const { return m256; } -__forceinline float8::operator __m256&(void) +__forceinline float8::operator __m256&() { return m256; } diff --git a/intern/cycles/util/util_types_int3.h b/intern/cycles/util/util_types_int3.h index 9d43b201c02..f68074b982b 100644 --- a/intern/cycles/util/util_types_int3.h +++ b/intern/cycles/util/util_types_int3.h @@ -35,8 +35,8 @@ struct ccl_try_align(16) int3 { __forceinline int3(const int3& a); __forceinline explicit int3(const __m128i& a); - __forceinline operator const __m128i&(void) const; - __forceinline operator __m128i&(void); + __forceinline operator const __m128i&() const; + __forceinline operator __m128i&(); __forceinline int3& operator =(const int3& a); #else /* __KERNEL_SSE__ */ diff --git a/intern/cycles/util/util_types_int3_impl.h b/intern/cycles/util/util_types_int3_impl.h index ada50c4812c..1b195ca753f 100644 --- a/intern/cycles/util/util_types_int3_impl.h +++ b/intern/cycles/util/util_types_int3_impl.h @@ -43,12 +43,12 @@ __forceinline int3::int3(const int3& a) { } -__forceinline int3::operator const __m128i&(void) const +__forceinline int3::operator const __m128i&() const { return m128; } -__forceinline int3::operator __m128i&(void) +__forceinline int3::operator __m128i&() { return m128; } diff --git a/intern/cycles/util/util_types_int4.h b/intern/cycles/util/util_types_int4.h index cdd0ecbdae5..52e6fed8c14 100644 --- a/intern/cycles/util/util_types_int4.h +++ 
b/intern/cycles/util/util_types_int4.h @@ -26,6 +26,7 @@ CCL_NAMESPACE_BEGIN #ifndef __KERNEL_GPU__ struct float3; +struct float4; struct ccl_try_align(16) int4 { #ifdef __KERNEL_SSE__ @@ -38,8 +39,8 @@ struct ccl_try_align(16) int4 { __forceinline int4(const int4& a); __forceinline explicit int4(const __m128i& a); - __forceinline operator const __m128i&(void) const; - __forceinline operator __m128i&(void); + __forceinline operator const __m128i&() const; + __forceinline operator __m128i&(); __forceinline int4& operator=(const int4& a); #else /* __KERNEL_SSE__ */ @@ -53,6 +54,7 @@ struct ccl_try_align(16) int4 { ccl_device_inline int4 make_int4(int i); ccl_device_inline int4 make_int4(int x, int y, int z, int w); ccl_device_inline int4 make_int4(const float3& f); +ccl_device_inline int4 make_int4(const float4& f); ccl_device_inline void print_int4(const char *label, const int4& a); #endif /* __KERNEL_GPU__ */ diff --git a/intern/cycles/util/util_types_int4_impl.h b/intern/cycles/util/util_types_int4_impl.h index 07cdc88f2dc..c058f86c400 100644 --- a/intern/cycles/util/util_types_int4_impl.h +++ b/intern/cycles/util/util_types_int4_impl.h @@ -43,12 +43,12 @@ __forceinline int4::int4(const __m128i& a) { } -__forceinline int4::operator const __m128i&(void) const +__forceinline int4::operator const __m128i&() const { return m128; } -__forceinline int4::operator __m128i&(void) +__forceinline int4::operator __m128i&() { return m128; } @@ -104,6 +104,16 @@ ccl_device_inline int4 make_int4(const float3& f) return a; } +ccl_device_inline int4 make_int4(const float4& f) +{ +#ifdef __KERNEL_SSE__ + int4 a(_mm_cvtps_epi32(f.m128)); +#else + int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w}; +#endif + return a; +} + ccl_device_inline void print_int4(const char *label, const int4& a) { printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w); diff --git a/intern/cycles/util/util_vector.h b/intern/cycles/util/util_vector.h index 0b33221ad4d..18fa231d6e7 100644 --- 
a/intern/cycles/util/util_vector.h +++ b/intern/cycles/util/util_vector.h @@ -17,8 +17,6 @@ #ifndef __UTIL_VECTOR_H__ #define __UTIL_VECTOR_H__ -/* Vector */ - #include <cassert> #include <cstring> #include <vector> @@ -29,12 +27,9 @@ CCL_NAMESPACE_BEGIN -/* Vector - * - * Own subclass-ed vestion of std::vector. Subclass is needed because: +/* Own subclass-ed vestion of std::vector. Subclass is needed because: * * - Use own allocator which keeps track of used/peak memory. - * * - Have method to ensure capacity is re-set to 0. */ template<typename value_type, @@ -42,30 +37,16 @@ template<typename value_type, class vector : public std::vector<value_type, allocator_type> { public: - /* Default constructor. */ - explicit vector() : std::vector<value_type, allocator_type>() { } - - /* Fill constructor. */ - explicit vector(size_t n, const value_type& val = value_type()) - : std::vector<value_type, allocator_type>(n, val) { } - - /* Range constructor. */ - template <class InputIterator> - vector(InputIterator first, InputIterator last) - : std::vector<value_type, allocator_type>(first, last) { } - - /* Copy constructor. */ - vector(const vector &x) : std::vector<value_type, allocator_type>(x) { } + typedef std::vector<value_type, allocator_type> BaseClass; - void shrink_to_fit(void) - { - std::vector<value_type, allocator_type>::shrink_to_fit(); - } + /* Inherit all constructors from base class. */ + using BaseClass::vector; - void free_memory(void) + /* Try as hard as possible to use zero memory. */ + void free_memory() { - std::vector<value_type, allocator_type>::resize(0); - shrink_to_fit(); + BaseClass::resize(0); + BaseClass::shrink_to_fit(); } /* Some external API might demand working with std::vector. */ @@ -75,265 +56,6 @@ public: } }; -/* Array - * - * Simplified version of vector, serving multiple purposes: - * - somewhat faster in that it does not clear memory on resize/alloc, - * this was actually showing up in profiles quite significantly. 
it - * also does not run any constructors/destructors - * - if this is used, we are not tempted to use inefficient operations - * - aligned allocation for CPU native data types */ - -template<typename T, size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES> -class array -{ -public: - array() - : data_(NULL), - datasize_(0), - capacity_(0) - {} - - explicit array(size_t newsize) - { - if(newsize == 0) { - data_ = NULL; - datasize_ = 0; - capacity_ = 0; - } - else { - data_ = mem_allocate(newsize); - datasize_ = newsize; - capacity_ = datasize_; - } - } - - array(const array& from) - { - if(from.datasize_ == 0) { - data_ = NULL; - datasize_ = 0; - capacity_ = 0; - } - else { - data_ = mem_allocate(from.datasize_); - memcpy(data_, from.data_, from.datasize_*sizeof(T)); - datasize_ = from.datasize_; - capacity_ = datasize_; - } - } - - array& operator=(const array& from) - { - if(this != &from) { - resize(from.size()); - memcpy((void*)data_, from.data_, datasize_*sizeof(T)); - } - - return *this; - } - - array& operator=(const vector<T>& from) - { - resize(from.size()); - - if(from.size() > 0) { - memcpy(data_, &from[0], datasize_*sizeof(T)); - } - - return *this; - } - - ~array() - { - mem_free(data_, capacity_); - } - - bool operator==(const array<T>& other) const - { - if(datasize_ != other.datasize_) { - return false; - } - - return memcmp(data_, other.data_, datasize_*sizeof(T)) == 0; - } - - bool operator!=(const array<T>& other) const - { - return !(*this == other); - } - - void steal_data(array& from) - { - if(this != &from) { - clear(); - - data_ = from.data_; - datasize_ = from.datasize_; - capacity_ = from.capacity_; - - from.data_ = NULL; - from.datasize_ = 0; - from.capacity_ = 0; - } - } - - T *steal_pointer() - { - T *ptr = data_; - data_ = NULL; - clear(); - return ptr; - } - - T* resize(size_t newsize) - { - if(newsize == 0) { - clear(); - } - else if(newsize != datasize_) { - if(newsize > capacity_) { - T *newdata = mem_allocate(newsize); - if(newdata == 
NULL) { - /* Allocation failed, likely out of memory. */ - clear(); - return NULL; - } - else if(data_ != NULL) { - memcpy((void *)newdata, - data_, - ((datasize_ < newsize)? datasize_: newsize)*sizeof(T)); - mem_free(data_, capacity_); - } - data_ = newdata; - capacity_ = newsize; - } - datasize_ = newsize; - } - return data_; - } - - T* resize(size_t newsize, const T& value) - { - size_t oldsize = size(); - resize(newsize); - - for(size_t i = oldsize; i < size(); i++) { - data_[i] = value; - } - - return data_; - } - - void clear() - { - if(data_ != NULL) { - mem_free(data_, capacity_); - data_ = NULL; - } - datasize_ = 0; - capacity_ = 0; - } - - size_t empty() const - { - return datasize_ == 0; - } - - size_t size() const - { - return datasize_; - } - - T* data() - { - return data_; - } - - const T* data() const - { - return data_; - } - - T& operator[](size_t i) const - { - assert(i < datasize_); - return data_[i]; - } - - void reserve(size_t newcapacity) - { - if(newcapacity > capacity_) { - T *newdata = mem_allocate(newcapacity); - if(data_ != NULL) { - memcpy(newdata, data_, ((datasize_ < newcapacity)? datasize_: newcapacity)*sizeof(T)); - mem_free(data_, capacity_); - } - data_ = newdata; - capacity_ = newcapacity; - } - } - - size_t capacity() const - { - return capacity_; - } - - // do not use this method unless you are sure the code is not performance critical - void push_back_slow(const T& t) - { - if(capacity_ == datasize_) - { - reserve(datasize_ == 0 ? 
1 : (size_t)((datasize_ + 1) * 1.2)); - } - - data_[datasize_++] = t; - } - - void push_back_reserved(const T& t) - { - assert(datasize_ < capacity_); - push_back_slow(t); - } - - void append(const array<T>& from) - { - if(from.size()) { - size_t old_size = size(); - resize(old_size + from.size()); - memcpy(data_ + old_size, from.data(), sizeof(T) * from.size()); - } - } - -protected: - inline T* mem_allocate(size_t N) - { - if(N == 0) { - return NULL; - } - T *mem = (T*)util_aligned_malloc(sizeof(T)*N, alignment); - if(mem != NULL) { - util_guarded_mem_alloc(sizeof(T)*N); - } - else { - throw std::bad_alloc(); - } - return mem; - } - - inline void mem_free(T *mem, size_t N) - { - if(mem != NULL) { - util_guarded_mem_free(sizeof(T)*N); - util_aligned_free(mem); - } - } - - T *data_; - size_t datasize_; - size_t capacity_; -}; - CCL_NAMESPACE_END -#endif /* __UTIL_VECTOR_H__ */ +#endif /* __UTIL_VECTOR_H__ */ diff --git a/intern/cycles/util/util_version.h b/intern/cycles/util/util_version.h index 112255f447b..980c5a269e6 100644 --- a/intern/cycles/util/util_version.h +++ b/intern/cycles/util/util_version.h @@ -34,4 +34,4 @@ CCL_NAMESPACE_BEGIN CCL_NAMESPACE_END -#endif /* __UTIL_VERSION_H__ */ +#endif /* __UTIL_VERSION_H__ */ diff --git a/intern/cycles/util/util_view.cpp b/intern/cycles/util/util_view.cpp index 3836cc86ee0..9dffd7a80bd 100644 --- a/intern/cycles/util/util_view.cpp +++ b/intern/cycles/util/util_view.cpp @@ -215,7 +215,7 @@ static void view_motion(int x, int y) V.mouseY = y; } -static void view_idle(void) +static void view_idle() { if(V.redraw) { V.redraw = false; diff --git a/intern/cycles/util/util_view.h b/intern/cycles/util/util_view.h index e603e605776..ae50b098b39 100644 --- a/intern/cycles/util/util_view.h +++ b/intern/cycles/util/util_view.h @@ -22,10 +22,10 @@ CCL_NAMESPACE_BEGIN -typedef void (*ViewInitFunc)(void); -typedef void (*ViewExitFunc)(void); +typedef void (*ViewInitFunc)(); +typedef void (*ViewExitFunc)(); typedef void 
(*ViewResizeFunc)(int width, int height); -typedef void (*ViewDisplayFunc)(void); +typedef void (*ViewDisplayFunc)(); typedef void (*ViewKeyboardFunc)(unsigned char key); typedef void (*ViewMotionFunc)(int x, int y, int button); @@ -40,4 +40,4 @@ void view_redraw(); CCL_NAMESPACE_END -#endif /*__UTIL_VIEW_H__*/ +#endif /*__UTIL_VIEW_H__*/ diff --git a/intern/cycles/util/util_windows.h b/intern/cycles/util/util_windows.h index 3b23ce8a3cb..9b9268fed7a 100644 --- a/intern/cycles/util/util_windows.h +++ b/intern/cycles/util/util_windows.h @@ -56,4 +56,4 @@ CCL_NAMESPACE_END #endif /* WIN32 */ -#endif /* __UTIL_WINDOWS_H__ */ +#endif /* __UTIL_WINDOWS_H__ */ diff --git a/intern/cycles/util/util_xml.h b/intern/cycles/util/util_xml.h index 6f06f17937b..c8a3a495f30 100644 --- a/intern/cycles/util/util_xml.h +++ b/intern/cycles/util/util_xml.h @@ -38,4 +38,4 @@ using PUGIXML_NAMESPACE::xml_parse_result; CCL_NAMESPACE_END -#endif /* __UTIL_XML_H__ */ +#endif /* __UTIL_XML_H__ */ diff --git a/intern/elbeem/extern/LBM_fluidsim.h b/intern/elbeem/extern/LBM_fluidsim.h index 374e11d1c06..05ac143c7a1 100644 --- a/intern/elbeem/extern/LBM_fluidsim.h +++ b/intern/elbeem/extern/LBM_fluidsim.h @@ -1,7 +1,4 @@ /* - * BKE_fluidsim.h - * - * * ***** BEGIN GPL LICENSE BLOCK ***** * * This program is free software; you can redistribute it and/or @@ -44,5 +41,3 @@ int performElbeemSimulation(char *cfgfilename); #endif - - diff --git a/intern/ghost/GHOST_C-api.h b/intern/ghost/GHOST_C-api.h index 86e64b7eeb0..bc5c67f4bed 100644 --- a/intern/ghost/GHOST_C-api.h +++ b/intern/ghost/GHOST_C-api.h @@ -908,6 +908,11 @@ extern int GHOST_SupportsNativeDialogs(void); extern int GHOST_UseNativePixels(void); /** + * Focus window after opening, or put them in the background. + */ +extern void GHOST_UseWindowFocus(int use_focus); + +/** * If window was opened using native pixel size, it returns scaling factor. 
*/ extern float GHOST_GetNativePixelSize(GHOST_WindowHandle windowhandle); diff --git a/intern/ghost/GHOST_ISystem.h b/intern/ghost/GHOST_ISystem.h index be2a94bd508..9b017c2da38 100644 --- a/intern/ghost/GHOST_ISystem.h +++ b/intern/ghost/GHOST_ISystem.h @@ -306,6 +306,11 @@ public: */ virtual bool useNativePixel(void) = 0; + /** + * Focus window after opening, or put them in the background. + */ + virtual void useWindowFocus(const bool use_focus) = 0; + /*************************************************************************************** * Event management functionality ***************************************************************************************/ diff --git a/intern/ghost/intern/GHOST_C-api.cpp b/intern/ghost/intern/GHOST_C-api.cpp index 797fb16dd6f..e60a8a9d3f3 100644 --- a/intern/ghost/intern/GHOST_C-api.cpp +++ b/intern/ghost/intern/GHOST_C-api.cpp @@ -911,6 +911,12 @@ int GHOST_UseNativePixels(void) return system->useNativePixel(); } +void GHOST_UseWindowFocus(int use_focus) +{ + GHOST_ISystem *system = GHOST_ISystem::getSystem(); + return system->useWindowFocus(use_focus); +} + float GHOST_GetNativePixelSize(GHOST_WindowHandle windowhandle) { GHOST_IWindow *window = (GHOST_IWindow *) windowhandle; diff --git a/intern/ghost/intern/GHOST_DropTargetWin32.h b/intern/ghost/intern/GHOST_DropTargetWin32.h index 3d7be45799f..5dcefcaf25e 100644 --- a/intern/ghost/intern/GHOST_DropTargetWin32.h +++ b/intern/ghost/intern/GHOST_DropTargetWin32.h @@ -60,7 +60,7 @@ public: * - Determining the effect of the drop on the target application. * - Incorporating any valid dropped data when the drop occurs. * - Communicating target feedback to the source so the source application - * can provide appropriate visual feedback such as setting the cursor. + * can provide appropriate visual feedback such as setting the cursor. * - Implementing drag scrolling. * - Registering and revoking its application windows as drop targets. 
* diff --git a/intern/ghost/intern/GHOST_EventTrackpad.h b/intern/ghost/intern/GHOST_EventTrackpad.h index faf0f1697d0..25988090c01 100644 --- a/intern/ghost/intern/GHOST_EventTrackpad.h +++ b/intern/ghost/intern/GHOST_EventTrackpad.h @@ -20,8 +20,8 @@ * * The Original Code is: all of this file. * - * Contributor(s): James Deery 11/2009 - * Damien Plisson 12/2009 + * Contributor(s): James Deery 11/2009 + * Damien Plisson 12/2009 * * ***** END GPL LICENSE BLOCK ***** */ diff --git a/intern/ghost/intern/GHOST_System.cpp b/intern/ghost/intern/GHOST_System.cpp index 0629eacc3ff..39f915bb0c2 100644 --- a/intern/ghost/intern/GHOST_System.cpp +++ b/intern/ghost/intern/GHOST_System.cpp @@ -48,6 +48,7 @@ GHOST_System::GHOST_System() : m_nativePixel(false), + m_windowFocus(true), m_displayManager(NULL), m_timerManager(NULL), m_windowManager(NULL), @@ -390,3 +391,8 @@ bool GHOST_System::useNativePixel(void) m_nativePixel = true; return 1; } + +void GHOST_System::useWindowFocus(const bool use_focus) +{ + m_windowFocus = use_focus; +} diff --git a/intern/ghost/intern/GHOST_System.h b/intern/ghost/intern/GHOST_System.h index 464d9269f28..ee3c30c35b4 100644 --- a/intern/ghost/intern/GHOST_System.h +++ b/intern/ghost/intern/GHOST_System.h @@ -177,6 +177,12 @@ public: bool useNativePixel(void); bool m_nativePixel; + /** + * Focus window after opening, or put them in the background. + */ + void useWindowFocus(const bool use_focus); + bool m_windowFocus; + /*************************************************************************************** * Event management functionality ***************************************************************************************/ diff --git a/intern/ghost/intern/GHOST_SystemCocoa.h b/intern/ghost/intern/GHOST_SystemCocoa.h index d31df79bc2b..146fa841555 100644 --- a/intern/ghost/intern/GHOST_SystemCocoa.h +++ b/intern/ghost/intern/GHOST_SystemCocoa.h @@ -20,8 +20,8 @@ * * The Original Code is: all of this file. 
* - * Contributor(s): Maarten Gribnau 05/2001 - * Damien Plisson 09/2009 + * Contributor(s): Maarten Gribnau 05/2001 + * Damien Plisson 09/2009 * * ***** END GPL LICENSE BLOCK ***** */ diff --git a/intern/ghost/intern/GHOST_SystemCocoa.mm b/intern/ghost/intern/GHOST_SystemCocoa.mm index 916da546669..697c0fae809 100644 --- a/intern/ghost/intern/GHOST_SystemCocoa.mm +++ b/intern/ghost/intern/GHOST_SystemCocoa.mm @@ -304,11 +304,13 @@ extern "C" int GHOST_HACK_getFirstFile(char buf[FIRSTFILEBUFLG]) - (void)applicationDidFinishLaunching:(NSNotification *)aNotification { - // raise application to front, convenient when starting from the terminal - // and important for launching the animation player. we call this after the - // application finishes launching, as doing it earlier can make us end up - // with a frontmost window but an inactive application - [NSApp activateIgnoringOtherApps:YES]; + if (systemCocoa->m_windowFocus) { + // Raise application to front, convenient when starting from the terminal + // and important for launching the animation player. we call this after the + // application finishes launching, as doing it earlier can make us end up + // with a frontmost window but an inactive application. + [NSApp activateIgnoringOtherApps:YES]; + } } - (BOOL)application:(NSApplication *)theApplication openFile:(NSString *)filename diff --git a/intern/ghost/intern/GHOST_SystemX11.h b/intern/ghost/intern/GHOST_SystemX11.h index 3a65ff2a843..e9312ceb683 100644 --- a/intern/ghost/intern/GHOST_SystemX11.h +++ b/intern/ghost/intern/GHOST_SystemX11.h @@ -149,16 +149,16 @@ public: * Create a new window. * The new window is added to the list of windows managed. * Never explicitly delete the window, use disposeWindow() instead. - * \param title The name of the window (displayed in the title bar of the window if the OS supports it). - * \param left The coordinate of the left edge of the window. - * \param top The coordinate of the top edge of the window. 
- * \param width The width the window. - * \param height The height the window. - * \param state The state of the window when opened. - * \param type The type of drawing context installed in this window. - * \param stereoVisual Create a stereo visual for quad buffered stereo. - * \param exclusive Use to show the window ontop and ignore others - * (used fullscreen). + * \param title The name of the window (displayed in the title bar of the window if the OS supports it). + * \param left The coordinate of the left edge of the window. + * \param top The coordinate of the top edge of the window. + * \param width The width the window. + * \param height The height the window. + * \param state The state of the window when opened. + * \param type The type of drawing context installed in this window. + * \param stereoVisual Create a stereo visual for quad buffered stereo. + * \param exclusive Use to show the window ontop and ignore others + * (used fullscreen). * \param parentWindow Parent (embedder) window * \return The new window (or 0 if creation failed). */ diff --git a/intern/ghost/intern/GHOST_Window.h b/intern/ghost/intern/GHOST_Window.h index 59d3123b9de..2fa30049eca 100644 --- a/intern/ghost/intern/GHOST_Window.h +++ b/intern/ghost/intern/GHOST_Window.h @@ -55,14 +55,14 @@ public: * Constructor. * Creates a new window and opens it. * To check if the window was created properly, use the getValid() method. - * \param width The width the window. - * \param heigh The height the window. - * \param state The state the window is initially opened with. - * \param type The type of drawing context installed in this window. - * \param stereoVisual Stereo visual for quad buffered stereo. - * \param exclusive Use to show the window ontop and ignore others - * (used fullscreen). - * \param numOfAASamples Number of samples used for AA (zero if no AA) + * \param width The width the window. + * \param heigh The height the window. 
+ * \param state The state the window is initially opened with. + * \param type The type of drawing context installed in this window. + * \param stereoVisual Stereo visual for quad buffered stereo. + * \param exclusive Use to show the window ontop and ignore others + * (used fullscreen). + * \param numOfAASamples Number of samples used for AA (zero if no AA) */ GHOST_Window( GHOST_TUns32 width, diff --git a/intern/ghost/intern/GHOST_WindowCocoa.mm b/intern/ghost/intern/GHOST_WindowCocoa.mm index 20060ac1267..79d2f304e60 100644 --- a/intern/ghost/intern/GHOST_WindowCocoa.mm +++ b/intern/ghost/intern/GHOST_WindowCocoa.mm @@ -551,7 +551,15 @@ GHOST_WindowCocoa::GHOST_WindowCocoa( //Creates the OpenGL View inside the window m_openGLView = [[CocoaOpenGLView alloc] initWithFrame:rect]; - + + if (m_systemCocoa->m_nativePixel) { + // Needs to happen early when building with the 10.14 SDK, otherwise + // has no effect until resizeing the window. + if ([m_openGLView respondsToSelector:@selector(setWantsBestResolutionOpenGLSurface:)]) { + [m_openGLView setWantsBestResolutionOpenGLSurface:YES]; + } + } + [m_openGLView setSystemAndWindowCocoa:systemCocoa windowCocoa:this]; [m_window setContentView:m_openGLView]; @@ -563,14 +571,9 @@ GHOST_WindowCocoa::GHOST_WindowCocoa( updateDrawingContext(); activateDrawingContext(); - // XXX jwilkins: This seems like it belongs in GHOST_ContextCGL, but probably not GHOST_ContextEGL if (m_systemCocoa->m_nativePixel) { - if ([m_openGLView respondsToSelector:@selector(setWantsBestResolutionOpenGLSurface:)]) { - [m_openGLView setWantsBestResolutionOpenGLSurface:YES]; - - NSRect backingBounds = [m_openGLView convertRectToBacking:[m_openGLView bounds]]; - m_nativePixelSize = (float)backingBounds.size.width / (float)rect.size.width; - } + NSRect backingBounds = [m_openGLView convertRectToBacking:[m_openGLView bounds]]; + m_nativePixelSize = (float)backingBounds.size.width / (float)rect.size.width; } setTitle(title); diff --git 
a/intern/ghost/intern/GHOST_WindowWin32.cpp b/intern/ghost/intern/GHOST_WindowWin32.cpp index 92de41a859b..983fffc10e6 100644 --- a/intern/ghost/intern/GHOST_WindowWin32.cpp +++ b/intern/ghost/intern/GHOST_WindowWin32.cpp @@ -201,6 +201,11 @@ GHOST_WindowWin32::GHOST_WindowWin32(GHOST_SystemWin32 *system, // Store a pointer to this class in the window structure ::SetWindowLongPtr(m_hWnd, GWLP_USERDATA, (LONG_PTR) this); + if (!m_system->m_windowFocus) { + // Lower to bottom and don't activate if we don't want focus + ::SetWindowPos(m_hWnd, HWND_BOTTOM, 0, 0, 0, 0, SWP_NOMOVE | SWP_NOSIZE | SWP_NOACTIVATE); + } + // Store the device context m_hDC = ::GetDC(m_hWnd); @@ -214,11 +219,11 @@ GHOST_WindowWin32::GHOST_WindowWin32(GHOST_SystemWin32 *system, nCmdShow = SW_SHOWMAXIMIZED; break; case GHOST_kWindowStateMinimized: - nCmdShow = SW_SHOWMINIMIZED; + nCmdShow = (m_system->m_windowFocus) ? SW_SHOWMINIMIZED : SW_SHOWMINNOACTIVE; break; case GHOST_kWindowStateNormal: default: - nCmdShow = SW_SHOWNORMAL; + nCmdShow = (m_system->m_windowFocus) ? SW_SHOWNORMAL : SW_SHOWNOACTIVATE; break; } @@ -1105,12 +1110,12 @@ GHOST_TSuccess GHOST_WindowWin32::endProgressBar() #ifdef WITH_INPUT_IME void GHOST_WindowWin32::beginIME(GHOST_TInt32 x, GHOST_TInt32 y, GHOST_TInt32 w, GHOST_TInt32 h, int completed) { - m_imeImput.BeginIME(m_hWnd, GHOST_Rect(x, y - h, x, y), (bool)completed); + m_imeInput.BeginIME(m_hWnd, GHOST_Rect(x, y - h, x, y), (bool)completed); } void GHOST_WindowWin32::endIME() { - m_imeImput.EndIME(m_hWnd); + m_imeInput.EndIME(m_hWnd); } #endif /* WITH_INPUT_IME */ diff --git a/intern/ghost/intern/GHOST_WindowWin32.h b/intern/ghost/intern/GHOST_WindowWin32.h index c72669ed898..8b0ba2f1934 100644 --- a/intern/ghost/intern/GHOST_WindowWin32.h +++ b/intern/ghost/intern/GHOST_WindowWin32.h @@ -225,10 +225,10 @@ public: * capturing). 
* * \param press - * 0 - mouse pressed - * 1 - mouse released - * 2 - operator grab - * 3 - operator ungrab + * 0 - mouse pressed + * 1 - mouse released + * 2 - operator grab + * 3 - operator ungrab */ void registerMouseClickEvent(int press); @@ -265,7 +265,7 @@ public: bool m_inLiveResize; #ifdef WITH_INPUT_IME - GHOST_ImeWin32 *getImeInput() {return &m_imeImput;} + GHOST_ImeWin32 *getImeInput() {return &m_imeInput;} void beginIME( GHOST_TInt32 x, GHOST_TInt32 y, @@ -369,7 +369,7 @@ private: #ifdef WITH_INPUT_IME /** Handle input method editors event */ - GHOST_ImeWin32 m_imeImput; + GHOST_ImeWin32 m_imeInput; #endif bool m_debug_context; }; diff --git a/intern/ghost/intern/GHOST_WindowX11.cpp b/intern/ghost/intern/GHOST_WindowX11.cpp index 623d57705b2..a4ccdef3788 100644 --- a/intern/ghost/intern/GHOST_WindowX11.cpp +++ b/intern/ghost/intern/GHOST_WindowX11.cpp @@ -517,7 +517,7 @@ GHOST_WindowX11(GHOST_SystemX11 *system, natom++; } - if (m_system->m_atom.WM_TAKE_FOCUS) { + if (m_system->m_atom.WM_TAKE_FOCUS && m_system->m_windowFocus) { atoms[natom] = m_system->m_atom.WM_TAKE_FOCUS; natom++; } @@ -532,7 +532,7 @@ GHOST_WindowX11(GHOST_SystemX11 *system, { XWMHints *xwmhints = XAllocWMHints(); xwmhints->initial_state = NormalState; - xwmhints->input = True; + xwmhints->input = (m_system->m_windowFocus) ? 
True : False; xwmhints->flags = InputHint | StateHint; XSetWMHints(display, m_window, xwmhints); XFree(xwmhints); @@ -586,11 +586,15 @@ GHOST_WindowX11(GHOST_SystemX11 *system, setTitle(title); - if (exclusive) { + if (exclusive && system->m_windowFocus) { XMapRaised(m_display, m_window); } else { XMapWindow(m_display, m_window); + + if (!system->m_windowFocus) { + XLowerWindow(m_display, m_window); + } } GHOST_PRINT("Mapped window\n"); diff --git a/intern/guardedalloc/CMakeLists.txt b/intern/guardedalloc/CMakeLists.txt index 10ed4287185..3cec2fd1016 100644 --- a/intern/guardedalloc/CMakeLists.txt +++ b/intern/guardedalloc/CMakeLists.txt @@ -53,6 +53,11 @@ if(WIN32 AND NOT UNIX) ) endif() +# Jemalloc 5.0.0+ needs extra configuration. +if(WITH_MEM_JEMALLOC AND NOT ("${JEMALLOC_VERSION}" VERSION_LESS "5.0.0")) + add_definitions(-DWITH_JEMALLOC_CONF) +endif() + blender_add_lib(bf_intern_guardedalloc "${SRC}" "${INC}" "${INC_SYS}") # Override C++ alloc, optional. diff --git a/intern/guardedalloc/intern/mallocn.c b/intern/guardedalloc/intern/mallocn.c index a95cc9163c4..8c17da853e5 100644 --- a/intern/guardedalloc/intern/mallocn.c +++ b/intern/guardedalloc/intern/mallocn.c @@ -37,6 +37,13 @@ #include "mallocn_intern.h" +#ifdef WITH_JEMALLOC_CONF +/* If jemalloc is used, it reads this global variable and enables background + * threads to purge dirty pages. Otherwise we release memory too slowly or not + * at all if the thread that did the allocation stays inactive. 
*/ +const char *malloc_conf = "background_thread:true,dirty_decay_ms:4000"; +#endif + size_t (*MEM_allocN_len)(const void *vmemh) = MEM_lockfree_allocN_len; void (*MEM_freeN)(void *vmemh) = MEM_lockfree_freeN; void *(*MEM_dupallocN)(const void *vmemh) = MEM_lockfree_dupallocN; diff --git a/intern/locale/boost_locale_wrapper.cpp b/intern/locale/boost_locale_wrapper.cpp index 0707c0dd3e3..3fd8f146aa3 100644 --- a/intern/locale/boost_locale_wrapper.cpp +++ b/intern/locale/boost_locale_wrapper.cpp @@ -112,13 +112,17 @@ const char *bl_locale_pgettext(const char *msgctxt, const char *msgid) return r; return msgid; } - catch(std::bad_cast const &e) { /* if std::has_facet<char_message_facet>(l) == false, LC_ALL = "C" case */ -// std::cout << "bl_locale_pgettext(" << msgid << "): " << e.what() << " \n"; + catch(const std::bad_cast &e) { /* if std::has_facet<char_message_facet>(l) == false, LC_ALL = "C" case */ +#ifndef NDEBUG + std::cout << "bl_locale_pgettext(" << msgid << "): " << e.what() << " \n"; +#endif (void)e; return msgid; } - catch(std::exception const &e) { -// std::cout << "bl_locale_pgettext(" << msgctxt << ", " << msgid << "): " << e.what() << " \n"; + catch(const std::exception &e) { +#ifndef NDEBUG + std::cout << "bl_locale_pgettext(" << msgctxt << ", " << msgid << "): " << e.what() << " \n"; +#endif (void)e; return msgid; } diff --git a/intern/smoke/intern/FLUID_3D.cpp b/intern/smoke/intern/FLUID_3D.cpp index 8a27818ff36..fd0a7e2005f 100644 --- a/intern/smoke/intern/FLUID_3D.cpp +++ b/intern/smoke/intern/FLUID_3D.cpp @@ -38,7 +38,7 @@ #if PARALLEL==1 #include <omp.h> -#endif // PARALLEL +#endif // PARALLEL ////////////////////////////////////////////////////////////////////// // Construction/Destruction @@ -51,13 +51,13 @@ FLUID_3D::FLUID_3D(int *res, float dx, float dtdef, int init_heat, int init_fire _dt = dtdef; // just in case. 
set in step from a RNA factor _iterations = 100; - _tempAmb = 0; + _tempAmb = 0; _heatDiffusion = 1e-3; _totalTime = 0.0f; _totalSteps = 0; _res = Vec3Int(_xRes,_yRes,_zRes); _maxRes = MAX3(_xRes, _yRes, _zRes); - + // initialize wavelet turbulence /* if(amplify) @@ -65,7 +65,7 @@ FLUID_3D::FLUID_3D(int *res, float dx, float dtdef, int init_heat, int init_fire else _wTurbulence = NULL; */ - + // scale the constants according to the refinement of the grid if (!dx) _dx = 1.0f / (float)_maxRes; @@ -218,7 +218,7 @@ void FLUID_3D::initColors(float init_r, float init_g, float init_b) void FLUID_3D::setBorderObstacles() { - + // set side obstacles unsigned int index; for (int y = 0; y < _yRes; y++) @@ -331,7 +331,7 @@ void FLUID_3D::step(float dt, float gravity[3]) // If border rules have been changed if (_colloPrev != *_borderColli) { printf("Border collisions changed\n"); - + // DG TODO: Need to check that no animated obstacle flags are overwritten setBorderCollisions(); } @@ -490,7 +490,7 @@ void FLUID_3D::step(float dt, float gravity[3]) for (int i=1; i<stepParts; i++) { int zPos=(int)((float)i*partSize + 0.5f); - + artificialDampingExactSL(zPos); } @@ -620,7 +620,7 @@ void FLUID_3D::artificialDampingSL(int zBegin, int zEnd) { void FLUID_3D::artificialDampingExactSL(int pos) { const float w = 0.9; int index, x,y,z; - + size_t posslab; @@ -650,7 +650,7 @@ void FLUID_3D::artificialDampingExactSL(int pos) { _zVelocityTemp[index+1] + _zVelocityTemp[index-1] + _zVelocityTemp[index+_res[0]] + _zVelocityTemp[index-_res[0]] + _zVelocityTemp[index+_slabSize] + _zVelocityTemp[index-_slabSize] ); - + } } @@ -677,7 +677,7 @@ void FLUID_3D::artificialDampingExactSL(int pos) { _zVelocityTemp[index+1] + _zVelocityTemp[index-1] + _zVelocityTemp[index+_res[0]] + _zVelocityTemp[index-_res[0]] + _zVelocityTemp[index+_slabSize] + _zVelocityTemp[index-_slabSize] ); - + } } @@ -759,7 +759,7 @@ void FLUID_3D::wipeBoundaries(int zBegin, int zEnd) void FLUID_3D::wipeBoundariesSL(int zBegin, 
int zEnd) { - + ///////////////////////////////////// // setZeroBorder to all: ///////////////////////////////////// @@ -933,16 +933,16 @@ void FLUID_3D::project() memset(_pressure, 0, sizeof(float)*_totalCells); memset(_divergence, 0, sizeof(float)*_totalCells); - + // set velocity and pressure inside of obstacles to zero setObstacleBoundaries(_pressure, 0, _zRes); - + // copy out the boundaries if(!_domainBcLeft) setNeumannX(_xVelocity, _res, 0, _zRes); - else setZeroX(_xVelocity, _res, 0, _zRes); + else setZeroX(_xVelocity, _res, 0, _zRes); if(!_domainBcFront) setNeumannY(_yVelocity, _res, 0, _zRes); - else setZeroY(_yVelocity, _res, 0, _zRes); + else setZeroY(_yVelocity, _res, 0, _zRes); if(!_domainBcTop) setNeumannZ(_zVelocity, _res, 0, _zRes); else setZeroZ(_zVelocity, _res, 0, _zRes); @@ -953,13 +953,13 @@ void FLUID_3D::project() for (y = 1; y < _yRes - 1; y++, index += 2) for (x = 1; x < _xRes - 1; x++, index++) { - + if(_obstacles[index]) { _divergence[index] = 0.0f; continue; } - + float xright = _xVelocity[index + 1]; float xleft = _xVelocity[index - 1]; @@ -1058,7 +1058,7 @@ void FLUID_3D::project() ////////////////////////////////////////////////////////////////////// void FLUID_3D::setObstacleVelocity(int zBegin, int zEnd) { - + // completely TODO <-- who wrote this and what is here TODO? 
DG const size_t index_ = _slabSize + _xRes + 1; @@ -1082,7 +1082,7 @@ void FLUID_3D::setObstacleVelocity(int zBegin, int zEnd) { if (!_obstacles[index]) { - // if(_obstacles[index+1]) xright = - _xVelocityOb[index]; + // if(_obstacles[index+1]) xright = - _xVelocityOb[index]; if((_obstacles[index - 1] & 8) && abs(_xVelocityOb[index - 1]) > FLT_EPSILON ) { // printf("velocity x!\n"); @@ -1221,7 +1221,7 @@ void FLUID_3D::setObstaclePressure(float *_pressure, int zBegin, int zEnd) _pressure[index] += _pressure[index + _slabSize]; pcnt += 1.0f; } - + if(pcnt > 0.000001f) _pressure[index] /= pcnt; @@ -1254,7 +1254,7 @@ void FLUID_3D::setObstacleBoundaries(float *_pressure, int zBegin, int zEnd) for (int z = zBegin + bb; z < zEnd - bt; z++) { size_t index = index_ +(z-1)*_slabSize; - + for (int y = 1; y < _yRes - 1; y++, index += 2) { for (int x = 1; x < _xRes - 1; x++, index++) @@ -1563,7 +1563,7 @@ void FLUID_3D::addVorticity(int zBegin, int zEnd) // calculate normalized vorticity vectors float eps = _vorticityEps; - + //index = _slabSize + _xRes + 1; vIndex=_slabSize + _xRes + 1; @@ -1618,7 +1618,7 @@ void FLUID_3D::addVorticity(int zBegin, int zEnd) } // y loop //vIndex+=2*_xRes; } // z loop - + if (_xVorticity) delete[] _xVorticity; if (_yVorticity) delete[] _yVorticity; if (_zVorticity) delete[] _zVorticity; @@ -1704,10 +1704,10 @@ void FLUID_3D::advectMacCormackEnd2(int zBegin, int zEnd) /* set boundary conditions for velocity */ if(!_domainBcLeft) copyBorderX(_xVelocityTemp, res, zBegin, zEnd); - else setZeroX(_xVelocityTemp, res, zBegin, zEnd); + else setZeroX(_xVelocityTemp, res, zBegin, zEnd); if(!_domainBcFront) copyBorderY(_yVelocityTemp, res, zBegin, zEnd); - else setZeroY(_yVelocityTemp, res, zBegin, zEnd); + else setZeroY(_yVelocityTemp, res, zBegin, zEnd); if(!_domainBcTop) copyBorderZ(_zVelocityTemp, res, zBegin, zEnd); else setZeroZ(_zVelocityTemp, res, zBegin, zEnd); @@ -1778,9 +1778,9 @@ void FLUID_3D::updateFlame(float *react, float *flame, int 
total_cells) for (int index = 0; index < total_cells; index++) { /* model flame temperature curve from the reaction coordinate (fuel) - * TODO: Would probably be best to get rid of whole "flame" data field. - * Currently it's just sqrt mirror of reaction coordinate, and therefore - * basically just waste of memory and disk space... + * TODO: Would probably be best to get rid of whole "flame" data field. + * Currently it's just sqrt mirror of reaction coordinate, and therefore + * basically just waste of memory and disk space... */ if (react[index]>0.0f) { /* do a smooth falloff for rest of the values */ |