diff options
Diffstat (limited to 'intern')
138 files changed, 2090 insertions, 1351 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index 1500743763b..8854170c642 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -226,6 +226,9 @@ add_definitions( -DCCL_NAMESPACE_END=} ) +if(WITH_CYCLES_DEBUG) + add_definitions(-DWITH_CYCLES_DEBUG) +endif() if(WITH_CYCLES_STANDALONE_GUI) add_definitions(-DWITH_CYCLES_STANDALONE_GUI) endif() @@ -334,7 +337,7 @@ else() endif() # Warnings -if(CMAKE_COMPILER_IS_GNUCXX) +if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_C_COMPILER_ID MATCHES "Clang") ADD_CHECK_CXX_COMPILER_FLAG(CMAKE_CXX_FLAGS _has_cxxflag_float_conversion "-Werror=float-conversion") ADD_CHECK_CXX_COMPILER_FLAG(CMAKE_CXX_FLAGS _has_cxxflag_double_promotion "-Werror=double-promotion") ADD_CHECK_CXX_COMPILER_FLAG(CMAKE_CXX_FLAGS _has_no_error_unused_macros "-Wno-error=unused-macros") diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt index 149967ad331..f0540486656 100644 --- a/intern/cycles/blender/CMakeLists.txt +++ b/intern/cycles/blender/CMakeLists.txt @@ -138,11 +138,6 @@ endif() blender_add_lib(bf_intern_cycles "${SRC}" "${INC}" "${INC_SYS}" "${LIB}") -# avoid link failure with clang 3.4 debug -if(CMAKE_C_COMPILER_ID MATCHES "Clang" AND NOT ${CMAKE_C_COMPILER_VERSION} VERSION_LESS '3.4') - string(APPEND CMAKE_CXX_FLAGS_DEBUG " -gline-tables-only") -endif() - add_dependencies(bf_intern_cycles bf_rna) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${ADDON_FILES}" ${CYCLES_INSTALL_PATH}) diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 19580914f19..a478dc22cd0 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -218,6 +218,12 @@ enum_denoising_prefilter = ( ('ACCURATE', "Accurate", "Prefilter noisy guiding passes before denoising color. Improves quality when guiding passes are noisy using extra processing time", 3), ) +enum_direct_light_sampling_type = ( + ('MULTIPLE_IMPORTANCE_SAMPLING', "Multiple Importance Sampling", "Multiple importance sampling is used to combine direct light contributions from next-event estimation and forward path tracing", 0), + ('FORWARD_PATH_TRACING', "Forward Path Tracing", "Direct light contributions are only sampled using forward path tracing", 1), + ('NEXT_EVENT_ESTIMATION', "Next-Event Estimation", "Direct light contributions are only sampled using next-event estimation", 2), +) + def update_render_passes(self, context): scene = context.scene view_layer = context.view_layer @@ -325,6 +331,13 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): default=1024, ) + sample_offset: IntProperty( + name="Sample Offset", + description="Number of samples to skip when starting render", + min=0, max=(1 << 24), + default=0, + ) + time_limit: FloatProperty( name="Time Limit", description="Limit the render time (excluding synchronization time)." @@ -415,6 +428,13 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): default=0, ) + direct_light_sampling_type: EnumProperty( + name="Direct Light Sampling Type", + description="The type of strategy used for sampling direct light contributions", + items=enum_direct_light_sampling_type, + default='MULTIPLE_IMPORTANCE_SAMPLING', + ) + min_light_bounces: IntProperty( name="Min Light Bounces", description="Minimum number of light bounces. Setting this higher reduces noise in the first bounces, " diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 397823103c8..57741447608 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -290,6 +290,9 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel): col.active = not (cscene.use_adaptive_sampling and cscene.use_preview_adaptive_sampling) col.prop(cscene, "sampling_pattern", text="Pattern") + col = layout.column(align=True) + col.prop(cscene, "sample_offset") + layout.separator() heading = layout.column(align=True, heading="Scrambling Distance") diff --git a/intern/cycles/blender/curves.cpp b/intern/cycles/blender/curves.cpp index ffe0c553738..c96d01a8ffb 100644 --- a/intern/cycles/blender/curves.cpp +++ b/intern/cycles/blender/curves.cpp @@ -199,7 +199,7 @@ static bool ObtainCacheParticleUV(Hair *hair, b_mesh->uv_layers.begin(l); float2 uv = zero_float2(); - if (b_mesh->uv_layers.length()) + if (!b_mesh->uv_layers.empty()) b_psys.uv_on_emitter(psmd, *b_pa, pa_no, uv_num, &uv.x); CData->curve_uv.push_back_slow(uv); @@ -261,7 +261,7 @@ static bool ObtainCacheParticleVcol(Hair *hair, b_mesh->vertex_colors.begin(l); float4 vcol = make_float4(0.0f, 0.0f, 0.0f, 1.0f); - if (b_mesh->vertex_colors.length()) + if (!b_mesh->vertex_colors.empty()) b_psys.mcol_on_emitter(psmd, *b_pa, pa_no, vcol_num, &vcol.x); CData->curve_vcol.push_back_slow(vcol); diff --git a/intern/cycles/blender/display_driver.cpp b/intern/cycles/blender/display_driver.cpp index d5f6d85251e..abf421983b3 100644 --- a/intern/cycles/blender/display_driver.cpp +++ b/intern/cycles/blender/display_driver.cpp @@ -334,7 +334,7 @@ bool BlenderDisplayDriver::update_begin(const Params ¶ms, /* Update PBO dimensions if needed. * - * NOTE: Allocate the PBO for the the size which will fit the final render resolution (as in, + * NOTE: Allocate the PBO for the size which will fit the final render resolution (as in, * at a resolution divider 1. This was we don't need to recreate graphics interoperability * objects which are costly and which are tied to the specific underlying buffer size. * The downside of this approach is that when graphics interoperability is not used we are diff --git a/intern/cycles/blender/mesh.cpp b/intern/cycles/blender/mesh.cpp index b69bf88c213..bb17cfdcb45 100644 --- a/intern/cycles/blender/mesh.cpp +++ b/intern/cycles/blender/mesh.cpp @@ -555,7 +555,7 @@ static void attr_create_vertex_color(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh, /* Create uv map attributes. */ static void attr_create_uv_map(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh) { - if (b_mesh.uv_layers.length() != 0) { + if (!b_mesh.uv_layers.empty()) { for (BL::MeshUVLoopLayer &l : b_mesh.uv_layers) { const bool active_render = l.active_render(); AttributeStandard uv_std = (active_render) ? ATTR_STD_UV : ATTR_STD_NONE; @@ -619,7 +619,7 @@ static void attr_create_uv_map(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh) static void attr_create_subd_uv_map(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh, bool subdivide_uvs) { - if (b_mesh.uv_layers.length() != 0) { + if (!b_mesh.uv_layers.empty()) { BL::Mesh::uv_layers_iterator l; int i = 0; @@ -951,7 +951,7 @@ static void create_mesh(Scene *scene, N = attr_N->data_float3(); /* create generated coordinates from undeformed coordinates */ - const bool need_default_tangent = (subdivision == false) && (b_mesh.uv_layers.length() == 0) && + const bool need_default_tangent = (subdivision == false) && (b_mesh.uv_layers.empty()) && (mesh->need_attribute(scene, ATTR_STD_UV_TANGENT)); if (mesh->need_attribute(scene, ATTR_STD_GENERATED) || need_default_tangent) { Attribute *attr = attributes.add(ATTR_STD_GENERATED); diff --git a/intern/cycles/blender/sync.cpp b/intern/cycles/blender/sync.cpp index 43625a2e479..949b7cb1b3b 100644 --- a/intern/cycles/blender/sync.cpp +++ b/intern/cycles/blender/sync.cpp @@ -392,6 +392,12 @@ void BlenderSync::sync_integrator(BL::ViewLayer &b_view_layer, bool background) integrator->set_ao_bounces(0); } +#ifdef WITH_CYCLES_DEBUG + DirectLightSamplingType direct_light_sampling_type = (DirectLightSamplingType)get_enum( + cscene, "direct_light_sampling_type", DIRECT_LIGHT_SAMPLING_NUM, DIRECT_LIGHT_SAMPLING_MIS); + integrator->set_direct_light_sampling_type(direct_light_sampling_type); +#endif + const DenoiseParams denoise_params = get_denoise_params(b_scene, b_view_layer, background); integrator->set_use_denoise(denoise_params.use); @@ -835,18 +841,25 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine, /* samples */ int samples = get_int(cscene, "samples"); int preview_samples = get_int(cscene, "preview_samples"); + int sample_offset = get_int(cscene, "sample_offset"); if (background) { params.samples = samples; + params.sample_offset = sample_offset; } else { params.samples = preview_samples; - if (params.samples == 0) + if (params.samples == 0) { params.samples = INT_MAX; + } + params.sample_offset = 0; } + /* Clamp sample offset. */ + params.sample_offset = clamp(params.sample_offset, 0, Integrator::MAX_SAMPLES); + /* Clamp samples. */ - params.samples = min(params.samples, Integrator::MAX_SAMPLES); + params.samples = clamp(params.samples, 0, Integrator::MAX_SAMPLES - params.sample_offset); /* Viewport Performance */ params.pixel_size = b_engine.get_preview_pixel_size(b_scene); @@ -865,7 +878,7 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine, /* Time limit. */ if (background) { - params.time_limit = get_float(cscene, "time_limit"); + params.time_limit = (double)get_float(cscene, "time_limit"); } else { /* For the viewport it kind of makes more sense to think in terms of the noise floor, which is diff --git a/intern/cycles/blender/util.h b/intern/cycles/blender/util.h index 33fd2c416c8..be36bcdaaa8 100644 --- a/intern/cycles/blender/util.h +++ b/intern/cycles/blender/util.h @@ -303,7 +303,7 @@ static inline string image_user_file_path(BL::ImageUser &iuser, string filepath_str = string(filepath); if (load_tiled && ima.source() == BL::Image::source_TILED) { string udim; - if (ima.tiles.length() > 0) { + if (!ima.tiles.empty()) { udim = to_string(ima.tiles[0].number()); } string_replace(filepath_str, udim, "<UDIM>"); @@ -647,7 +647,7 @@ static inline Mesh::SubdivisionType object_subdivision_type(BL::Object &b_ob, { PointerRNA cobj = RNA_pointer_get(&b_ob.ptr, "cycles"); - if (cobj.data && b_ob.modifiers.length() > 0 && experimental) { + if (cobj.data && !b_ob.modifiers.empty() && experimental) { BL::Modifier mod = b_ob.modifiers[b_ob.modifiers.length() - 1]; bool enabled = preview ? mod.show_viewport() : mod.show_render(); diff --git a/intern/cycles/bvh/embree.cpp b/intern/cycles/bvh/embree.cpp index 944a84ce0da..b54b38f2798 100644 --- a/intern/cycles/bvh/embree.cpp +++ b/intern/cycles/bvh/embree.cpp @@ -303,7 +303,7 @@ static void rtc_error_func(void *, enum RTCError, const char *str) VLOG(1) << str; } -static double progress_start_time = 0.0f; +static double progress_start_time = 0.0; static bool rtc_progress_func(void *user_ptr, const double n) { diff --git a/intern/cycles/bvh/node.cpp b/intern/cycles/bvh/node.cpp index d3a665adfe7..60b0843bde2 100644 --- a/intern/cycles/bvh/node.cpp +++ b/intern/cycles/bvh/node.cpp @@ -153,7 +153,7 @@ void BVHNode::update_time() namespace { struct DumpTraversalContext { - /* Descriptor of wile where writing is happening. */ + /* Descriptor of while where writing is happening. */ FILE *stream; /* Unique identifier of the node current. */ int id; diff --git a/intern/cycles/bvh/node.h b/intern/cycles/bvh/node.h index d5de9e062fc..1fd9efdb75e 100644 --- a/intern/cycles/bvh/node.h +++ b/intern/cycles/bvh/node.h @@ -178,7 +178,7 @@ class InnerNode : public BVHNode { reset_unused_children(); } - /* NOTE: This function is only used during binary BVH builder, and it + /* NOTE: This function is only used during binary BVH builder, and it's * supposed to be configured to have 2 children which will be filled-in in a * bit. But this is important to have children reset to NULL. */ explicit InnerNode(const BoundBox &bounds) : BVHNode(bounds), num_children_(0) diff --git a/intern/cycles/cmake/macros.cmake b/intern/cycles/cmake/macros.cmake index a470fb9c574..957b702fd3a 100644 --- a/intern/cycles/cmake/macros.cmake +++ b/intern/cycles/cmake/macros.cmake @@ -88,7 +88,7 @@ endmacro() function(cycles_link_directories) if(APPLE) - # APPLE plaform uses full paths for linking libraries, and avoids link_directories. + # APPLE platform uses full paths for linking libraries, and avoids link_directories. return() endif() diff --git a/intern/cycles/device/cpu/device.cpp b/intern/cycles/device/cpu/device.cpp index f11b49ef65f..5aabed8702a 100644 --- a/intern/cycles/device/cpu/device.cpp +++ b/intern/cycles/device/cpu/device.cpp @@ -38,7 +38,6 @@ void device_cpu_info(vector<DeviceInfo> &devices) info.id = "CPU"; info.num = 0; info.has_osl = true; - info.has_half_images = true; info.has_nanovdb = true; info.has_profiling = true; if (openimagedenoise_supported()) { diff --git a/intern/cycles/device/cpu/device_impl.cpp b/intern/cycles/device/cpu/device_impl.cpp index 68dec7f0af2..2ad76de70ca 100644 --- a/intern/cycles/device/cpu/device_impl.cpp +++ b/intern/cycles/device/cpu/device_impl.cpp @@ -93,11 +93,6 @@ CPUDevice::~CPUDevice() texture_info.free(); } -bool CPUDevice::show_samples() const -{ - return (info.cpu_threads == 1); -} - BVHLayoutMask CPUDevice::get_bvh_layout_mask() const { BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2; diff --git a/intern/cycles/device/cpu/device_impl.h b/intern/cycles/device/cpu/device_impl.h index 90d217bb624..6f9452a6378 100644 --- a/intern/cycles/device/cpu/device_impl.h +++ b/intern/cycles/device/cpu/device_impl.h @@ -60,8 +60,6 @@ class CPUDevice : public Device { CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_); ~CPUDevice(); - virtual bool show_samples() const override; - virtual BVHLayoutMask get_bvh_layout_mask() const override; /* Returns true if the texture info was copied to the device (meaning, some more diff --git a/intern/cycles/device/cuda/device.cpp b/intern/cycles/device/cuda/device.cpp index af2bdc6e29c..0d9e6c72466 100644 --- a/intern/cycles/device/cuda/device.cpp +++ b/intern/cycles/device/cuda/device.cpp @@ -144,7 +144,6 @@ void device_cuda_info(vector<DeviceInfo> &devices) info.description = string(name); info.num = num; - info.has_half_images = (major >= 3); info.has_nanovdb = true; info.denoisers = 0; diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp index 20945796a2d..e05fef3897c 100644 --- a/intern/cycles/device/cuda/device_impl.cpp +++ b/intern/cycles/device/cuda/device_impl.cpp @@ -46,12 +46,6 @@ bool CUDADevice::have_precompiled_kernels() return path_exists(cubins_path); } -bool CUDADevice::show_samples() const -{ - /* The CUDADevice only processes one tile at a time, so showing samples is fine. */ - return true; -} - BVHLayoutMask CUDADevice::get_bvh_layout_mask() const { return BVH_LAYOUT_BVH2; @@ -242,6 +236,10 @@ string CUDADevice::compile_kernel_get_common_cflags(const uint kernel_features) cflags += " -DWITH_NANOVDB"; # endif +# ifdef WITH_CYCLES_DEBUG + cflags += " -DWITH_CYCLES_DEBUG"; +# endif + return cflags; } @@ -932,7 +930,6 @@ void CUDADevice::tex_alloc(device_texture &mem) { CUDAContextScope scope(this); - /* General variables for both architectures */ string bind_name = mem.name; size_t dsize = datatype_size(mem.data_type); size_t size = mem.memory_size(); @@ -1095,7 +1092,6 @@ void CUDADevice::tex_alloc(device_texture &mem) if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT && mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { - /* Kepler+, bindless textures. */ CUDA_RESOURCE_DESC resDesc; memset(&resDesc, 0, sizeof(resDesc)); diff --git a/intern/cycles/device/cuda/device_impl.h b/intern/cycles/device/cuda/device_impl.h index 72d4108d1bf..4c357d0b5ab 100644 --- a/intern/cycles/device/cuda/device_impl.h +++ b/intern/cycles/device/cuda/device_impl.h @@ -76,8 +76,6 @@ class CUDADevice : public Device { static bool have_precompiled_kernels(); - virtual bool show_samples() const override; - virtual BVHLayoutMask get_bvh_layout_mask() const override; void set_error(const string &error) override; diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 63d0a49d3eb..bfbcdb20d5e 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -286,7 +286,6 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices, info.description = "Multi Device"; info.num = 0; - info.has_half_images = true; info.has_nanovdb = true; info.has_osl = true; info.has_profiling = true; @@ -333,7 +332,6 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices, } /* Accumulate device info. */ - info.has_half_images &= device.has_half_images; info.has_nanovdb &= device.has_nanovdb; info.has_osl &= device.has_osl; info.has_profiling &= device.has_profiling; diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index 65188459c2c..346632de314 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -73,7 +73,6 @@ class DeviceInfo { int num; bool display_device; /* GPU is used as a display device. */ bool has_nanovdb; /* Support NanoVDB volumes. */ - bool has_half_images; /* Support half-float textures. */ bool has_osl; /* Support Open Shading Language. */ bool has_profiling; /* Supports runtime collection of profiling info. */ bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */ @@ -90,7 +89,6 @@ class DeviceInfo { num = 0; cpu_threads = 0; display_device = false; - has_half_images = false; has_nanovdb = false; has_osl = false; has_profiling = false; @@ -151,10 +149,6 @@ class Device { fprintf(stderr, "%s\n", error.c_str()); fflush(stderr); } - virtual bool show_samples() const - { - return false; - } virtual BVHLayoutMask get_bvh_layout_mask() const = 0; /* statistics */ diff --git a/intern/cycles/device/hip/device.cpp b/intern/cycles/device/hip/device.cpp index ecc109b2bb9..a9c7b1ba841 100644 --- a/intern/cycles/device/hip/device.cpp +++ b/intern/cycles/device/hip/device.cpp @@ -148,7 +148,6 @@ void device_hip_info(vector<DeviceInfo> &devices) info.description = string(name); info.num = num; - info.has_half_images = true; info.has_nanovdb = true; info.denoisers = 0; diff --git a/intern/cycles/device/hip/device_impl.cpp b/intern/cycles/device/hip/device_impl.cpp index 42dd5382ae5..53c4f3f0b3f 100644 --- a/intern/cycles/device/hip/device_impl.cpp +++ b/intern/cycles/device/hip/device_impl.cpp @@ -47,12 +47,6 @@ bool HIPDevice::have_precompiled_kernels() return path_exists(fatbins_path); } -bool HIPDevice::show_samples() const -{ - /* The HIPDevice only processes one tile at a time, so showing samples is fine. */ - return true; -} - BVHLayoutMask HIPDevice::get_bvh_layout_mask() const { return BVH_LAYOUT_BVH2; @@ -243,7 +237,7 @@ string HIPDevice::compile_kernel(const uint kernel_features, const char *name, c hipGetDeviceProperties(&props, hipDevId); /* gcnArchName can contain tokens after the arch name with features, ie. - "gfx1010:sramecc-:xnack-" so we tokenize it to get the first part. */ + * `gfx1010:sramecc-:xnack-` so we tokenize it to get the first part. */ char *arch = strtok(props.gcnArchName, ":"); if (arch == NULL) { arch = props.gcnArchName; @@ -374,10 +368,9 @@ string HIPDevice::compile_kernel(const uint kernel_features, const char *name, c bool HIPDevice::load_kernels(const uint kernel_features) { - /* TODO(sergey): Support kernels re-load for CUDA devices adaptive compile. + /* TODO(sergey): Support kernels re-load for HIP devices adaptive compile. * - * Currently re-loading kernel will invalidate memory pointers, - * causing problems in cuCtxSynchronize. + * Currently re-loading kernels will invalidate memory pointers. */ if (hipModule) { if (use_adaptive_compilation()) { @@ -899,7 +892,6 @@ void HIPDevice::tex_alloc(device_texture &mem) { HIPContextScope scope(this); - /* General variables for both architectures */ string bind_name = mem.name; size_t dsize = datatype_size(mem.data_type); size_t size = mem.memory_size(); diff --git a/intern/cycles/device/hip/device_impl.h b/intern/cycles/device/hip/device_impl.h index eb832ad828c..08a7be57e9c 100644 --- a/intern/cycles/device/hip/device_impl.h +++ b/intern/cycles/device/hip/device_impl.h @@ -75,8 +75,6 @@ class HIPDevice : public Device { static bool have_precompiled_kernels(); - virtual bool show_samples() const override; - virtual BVHLayoutMask get_bvh_layout_mask() const override; void set_error(const string &error) override; @@ -93,9 +91,7 @@ class HIPDevice : public Device { virtual string compile_kernel_get_common_cflags(const uint kernel_features); - string compile_kernel(const uint kernel_features, - const char *name, - const char *base = "hip"); + string compile_kernel(const uint kernel_features, const char *name, const char *base = "hip"); virtual bool load_kernels(const uint kernel_features) override; void reserve_local_memory(const uint kernel_features); diff --git a/intern/cycles/device/hip/graphics_interop.h b/intern/cycles/device/hip/graphics_interop.h index 8314405e670..71c6893edbd 100644 --- a/intern/cycles/device/hip/graphics_interop.h +++ b/intern/cycles/device/hip/graphics_interop.h @@ -48,7 +48,7 @@ class HIPDeviceGraphicsInterop : public DeviceGraphicsInterop { HIPDeviceQueue *queue_ = nullptr; HIPDevice *device_ = nullptr; - /* OpenGL PBO which is currently registered as the destination for the CUDA buffer. */ + /* OpenGL PBO which is currently registered as the destination for the HIP buffer. */ uint opengl_pbo_id_ = 0; /* Buffer area in pixels of the corresponding PBO. */ int64_t buffer_area_ = 0; diff --git a/intern/cycles/device/memory.cpp b/intern/cycles/device/memory.cpp index 86bf2542c92..ba2d993fb9e 100644 --- a/intern/cycles/device/memory.cpp +++ b/intern/cycles/device/memory.cpp @@ -23,7 +23,7 @@ CCL_NAMESPACE_BEGIN device_memory::device_memory(Device *device, const char *name, MemoryType type) : data_type(device_type_traits<uchar>::data_type), - data_elements(device_type_traits<uchar>::num_elements_cpu), + data_elements(device_type_traits<uchar>::num_elements), data_size(0), device_size(0), data_width(0), diff --git a/intern/cycles/device/memory.h b/intern/cycles/device/memory.h index e04142117aa..cb22c191656 100644 --- a/intern/cycles/device/memory.h +++ b/intern/cycles/device/memory.h @@ -81,155 +81,140 @@ static constexpr size_t datatype_size(DataType datatype) template<typename T> struct device_type_traits { static const DataType data_type = TYPE_UNKNOWN; - static const size_t num_elements_cpu = sizeof(T); - static const size_t num_elements_gpu = sizeof(T); + static const size_t num_elements = sizeof(T); }; template<> struct device_type_traits<uchar> { static const DataType data_type = TYPE_UCHAR; - static const size_t num_elements_cpu = 1; - static const size_t num_elements_gpu = 1; - static_assert(sizeof(uchar) == num_elements_cpu * datatype_size(data_type)); + static const size_t num_elements = 1; + static_assert(sizeof(uchar) == num_elements * datatype_size(data_type)); }; template<> struct device_type_traits<uchar2> { static const DataType data_type = TYPE_UCHAR; - static const size_t num_elements_cpu = 2; - static const size_t num_elements_gpu = 2; - static_assert(sizeof(uchar2) == num_elements_cpu * datatype_size(data_type)); + static const size_t num_elements = 2; + static_assert(sizeof(uchar2) == num_elements * datatype_size(data_type)); }; template<> struct device_type_traits<uchar3> { static const DataType data_type = TYPE_UCHAR; - static const size_t num_elements_cpu = 3; - static const size_t num_elements_gpu = 3; - static_assert(sizeof(uchar3) == num_elements_cpu * datatype_size(data_type)); + static const size_t num_elements = 3; + static_assert(sizeof(uchar3) == num_elements * datatype_size(data_type)); }; template<> struct device_type_traits<uchar4> { static const DataType data_type = TYPE_UCHAR; - static const size_t num_elements_cpu = 4; - static const size_t num_elements_gpu = 4; - static_assert(sizeof(uchar4) == num_elements_cpu * datatype_size(data_type)); + static const size_t num_elements = 4; + static_assert(sizeof(uchar4) == num_elements * datatype_size(data_type)); }; template<> struct device_type_traits<uint> { static const DataType data_type = TYPE_UINT; - static const size_t num_elements_cpu = 1; - static const size_t num_elements_gpu = 1; - static_assert(sizeof(uint) == num_elements_cpu * datatype_size(data_type)); + static const size_t num_elements = 1; + static_assert(sizeof(uint) == num_elements * datatype_size(data_type)); }; template<> struct device_type_traits<uint2> { static const DataType data_type = TYPE_UINT; - static const size_t num_elements_cpu = 2; - static const size_t num_elements_gpu = 2; - static_assert(sizeof(uint2) == num_elements_cpu * datatype_size(data_type)); + static const size_t num_elements = 2; + static_assert(sizeof(uint2) == num_elements * datatype_size(data_type)); }; template<> struct device_type_traits<uint3> { static const DataType data_type = TYPE_UINT; - static const size_t num_elements_cpu = 3; - static const size_t num_elements_gpu = 3; - static_assert(sizeof(uint3) == num_elements_cpu * datatype_size(data_type)); + static const size_t num_elements = 3; + static_assert(sizeof(uint3) == num_elements * datatype_size(data_type)); }; template<> struct device_type_traits<uint4> { static const DataType data_type = TYPE_UINT; - static const size_t num_elements_cpu = 4; - static const size_t num_elements_gpu = 4; - static_assert(sizeof(uint4) == num_elements_cpu * datatype_size(data_type)); + static const size_t num_elements = 4; + static_assert(sizeof(uint4) == num_elements * datatype_size(data_type)); }; template<> struct device_type_traits<int> { static const DataType data_type = TYPE_INT; - static const size_t num_elements_cpu = 1; - static const size_t num_elements_gpu = 1; - static_assert(sizeof(int) == num_elements_cpu * datatype_size(data_type)); + static const size_t num_elements = 1; + static_assert(sizeof(int) == num_elements * datatype_size(data_type)); }; template<> struct device_type_traits<int2> { static const DataType data_type = TYPE_INT; - static const size_t num_elements_cpu = 2; - static const size_t num_elements_gpu = 2; - static_assert(sizeof(int2) == num_elements_cpu * datatype_size(data_type)); + static const size_t num_elements = 2; + static_assert(sizeof(int2) == num_elements * datatype_size(data_type)); }; template<> struct device_type_traits<int3> { static const DataType data_type = TYPE_INT; - static const size_t num_elements_cpu = 4; - static const size_t num_elements_gpu = 3; - static_assert(sizeof(int3) == num_elements_cpu * datatype_size(data_type)); + static const size_t num_elements = 4; + static_assert(sizeof(int3) == num_elements * datatype_size(data_type)); }; template<> struct device_type_traits<int4> { static const DataType data_type = TYPE_INT; - static const size_t num_elements_cpu = 4; - static const size_t num_elements_gpu = 4; - static_assert(sizeof(int4) == num_elements_cpu * datatype_size(data_type)); + static const size_t num_elements = 4; + static_assert(sizeof(int4) == num_elements * datatype_size(data_type)); }; template<> struct device_type_traits<float> { static const DataType data_type = TYPE_FLOAT; - static const size_t num_elements_cpu = 1; - static const size_t num_elements_gpu = 1; - static_assert(sizeof(float) == num_elements_cpu * datatype_size(data_type)); + static const size_t num_elements = 1; + static_assert(sizeof(float) == num_elements * datatype_size(data_type)); }; template<> struct device_type_traits<float2> { static const DataType data_type = TYPE_FLOAT; - static const size_t num_elements_cpu = 2; - static const size_t num_elements_gpu = 2; - static_assert(sizeof(float2) == num_elements_cpu * datatype_size(data_type)); + static const size_t num_elements = 2; + static_assert(sizeof(float2) == num_elements * datatype_size(data_type)); }; template<> struct device_type_traits<float3> { + /* float3 has different size depending on the device, can't use it for interchanging + * memory between CPU and GPU. + * + * Leave body empty to trigger a compile error if used. */ +}; + +template<> struct device_type_traits<packed_float3> { static const DataType data_type = TYPE_FLOAT; - static const size_t num_elements_cpu = 4; - static const size_t num_elements_gpu = 3; - static_assert(sizeof(float3) == num_elements_cpu * datatype_size(data_type)); + static const size_t num_elements = 3; + static_assert(sizeof(packed_float3) == num_elements * datatype_size(data_type)); }; template<> struct device_type_traits<float4> { static const DataType data_type = TYPE_FLOAT; - static const size_t num_elements_cpu = 4; - static const size_t num_elements_gpu = 4; - static_assert(sizeof(float4) == num_elements_cpu * datatype_size(data_type)); + static const size_t num_elements = 4; + static_assert(sizeof(float4) == num_elements * datatype_size(data_type)); }; template<> struct device_type_traits<half> { static const DataType data_type = TYPE_HALF; - static const size_t num_elements_cpu = 1; - static const size_t num_elements_gpu = 1; - static_assert(sizeof(half) == num_elements_cpu * datatype_size(data_type)); + static const size_t num_elements = 1; + static_assert(sizeof(half) == num_elements * datatype_size(data_type)); }; template<> struct device_type_traits<ushort4> { static const DataType data_type = TYPE_UINT16; - static const size_t num_elements_cpu = 4; - static const size_t num_elements_gpu = 4; - static_assert(sizeof(ushort4) == num_elements_cpu * datatype_size(data_type)); + static const size_t num_elements = 4; + static_assert(sizeof(ushort4) == num_elements * datatype_size(data_type)); }; template<> struct device_type_traits<uint16_t> { static const DataType data_type = TYPE_UINT16; - static const size_t num_elements_cpu = 1; - static const size_t num_elements_gpu = 1; - static_assert(sizeof(uint16_t) == num_elements_cpu * datatype_size(data_type)); + static const size_t num_elements = 1; + static_assert(sizeof(uint16_t) == num_elements * datatype_size(data_type)); }; template<> struct device_type_traits<half4> { static const DataType data_type = TYPE_HALF; - static const size_t num_elements_cpu = 4; - static const size_t num_elements_gpu = 4; - static_assert(sizeof(half4) == num_elements_cpu * datatype_size(data_type)); + static const size_t num_elements = 4; + static_assert(sizeof(half4) == num_elements * datatype_size(data_type)); }; template<> struct device_type_traits<uint64_t> { static const DataType data_type = TYPE_UINT64; - static const size_t num_elements_cpu = 1; - static const size_t num_elements_gpu = 1; - static_assert(sizeof(uint64_t) == num_elements_cpu * datatype_size(data_type)); + static const size_t num_elements = 1; + static_assert(sizeof(uint64_t) == num_elements * datatype_size(data_type)); }; /* Device Memory @@ -325,9 +310,7 @@ template<typename T> class device_only_memory : public device_memory { : device_memory(device, name, allow_host_memory_fallback ? MEM_READ_WRITE : MEM_DEVICE_ONLY) { data_type = device_type_traits<T>::data_type; - data_elements = max(device_is_cpu() ? device_type_traits<T>::num_elements_cpu : - device_type_traits<T>::num_elements_gpu, - 1); + data_elements = max(device_type_traits<T>::num_elements, 1); } device_only_memory(device_only_memory &&other) noexcept : device_memory(std::move(other)) @@ -383,15 +366,11 @@ template<typename T> class device_only_memory : public device_memory { template<typename T> class device_vector : public device_memory { public: - /* Can only use this for types that have the same size on CPU and GPU. */ - static_assert(device_type_traits<T>::num_elements_cpu == - device_type_traits<T>::num_elements_gpu); - device_vector(Device *device, const char *name, MemoryType type) : device_memory(device, name, type) { data_type = device_type_traits<T>::data_type; - data_elements = device_type_traits<T>::num_elements_cpu; + data_elements = device_type_traits<T>::num_elements; modified = true; need_realloc_ = true; diff --git a/intern/cycles/device/multi/device.cpp b/intern/cycles/device/multi/device.cpp index 56efec3e131..e319246d4f4 100644 --- a/intern/cycles/device/multi/device.cpp +++ b/intern/cycles/device/multi/device.cpp @@ -109,14 +109,6 @@ class MultiDevice : public Device { return error_msg; } - virtual bool show_samples() const override - { - if (devices.size() > 1) { - return false; - } - return devices.front().device->show_samples(); - } - virtual BVHLayoutMask get_bvh_layout_mask() const override { BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL; diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp index 6e897e3831f..b82b1281eb8 100644 --- a/intern/cycles/device/optix/device_impl.cpp +++ b/intern/cycles/device/optix/device_impl.cpp @@ -208,11 +208,15 @@ bool OptiXDevice::load_kernels(const uint kernel_features) } else { module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3; - module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; + module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE; } module_options.boundValues = nullptr; module_options.numBoundValues = 0; +# if OPTIX_ABI_VERSION >= 55 + module_options.payloadTypes = nullptr; + module_options.numPayloadTypes = 0; +# endif OptixPipelineCompileOptions pipeline_options = {}; /* Default to no motion blur and two-level graph, since it is the fastest option. */ @@ -227,7 +231,11 @@ bool OptiXDevice::load_kernels(const uint kernel_features) pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE; if (kernel_features & KERNEL_FEATURE_HAIR) { if (kernel_features & KERNEL_FEATURE_HAIR_THICK) { +# if OPTIX_ABI_VERSION >= 55 + pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CATMULLROM; +# else pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE; +# endif } else pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM; @@ -324,7 +332,13 @@ bool OptiXDevice::load_kernels(const uint kernel_features) if (kernel_features & KERNEL_FEATURE_HAIR_THICK) { /* Built-in thick curve intersection. */ OptixBuiltinISOptions builtin_options = {}; +# if OPTIX_ABI_VERSION >= 55 + builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CATMULLROM; + builtin_options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE; + builtin_options.curveEndcapFlags = OPTIX_CURVE_ENDCAP_DEFAULT; /* Disable endcaps. */ +# else builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE; +# endif builtin_options.usesMotionBlur = false; optix_assert(optixBuiltinISModuleGet( @@ -411,7 +425,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features) link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL; } else { - link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; + link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE; } if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) { @@ -1178,6 +1192,15 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) int ka = max(k0 - 1, curve.first_key); int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1); + index_data[i] = i * 4; + float4 *const v = vertex_data.data() + step * num_vertices + index_data[i]; + +# if OPTIX_ABI_VERSION >= 55 + v[0] = make_float4(keys[ka].x, keys[ka].y, keys[ka].z, curve_radius[ka]); + v[1] = make_float4(keys[k0].x, keys[k0].y, keys[k0].z, curve_radius[k0]); + v[2] = make_float4(keys[k1].x, keys[k1].y, keys[k1].z, curve_radius[k1]); + v[3] = make_float4(keys[kb].x, keys[kb].y, keys[kb].z, curve_radius[kb]); +# else const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x); const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y); const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z); @@ -1190,8 +1213,6 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f; static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f; - index_data[i] = i * 4; - float4 *const v = vertex_data.data() + step * num_vertices + index_data[i]; v[0] = make_float4( dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw)); v[1] = make_float4( @@ -1200,6 +1221,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw)); v[3] = make_float4( dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw)); +# endif } else { BoundBox bounds = BoundBox::empty; @@ -1241,7 +1263,11 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) OptixBuildInput build_input = {}; if (hair->curve_shape == CURVE_THICK) { build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES; +# if OPTIX_ABI_VERSION >= 55 + build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CATMULLROM; +# else build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE; +# endif build_input.curveArray.numPrimitives = num_segments; build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data(); build_input.curveArray.numVertices = num_vertices; @@ -1422,9 +1448,12 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) instance.sbtOffset = PG_HITD_MOTION - PG_HITD; } } - else { - /* Can disable __anyhit__kernel_optix_visibility_test by default (except for thick curves, - * since it needs to filter out end-caps there). +# if OPTIX_ABI_VERSION < 55 + /* Cannot disable any-hit program for thick curves, since it needs to filter out endcaps. */ + else +# endif + { + /* Can disable __anyhit__kernel_optix_visibility_test by default. * It is enabled where necessary (visibility mask exceeds 8 bits or the other any-hit * programs like __anyhit__kernel_optix_shadow_all_hit) via OPTIX_RAY_FLAG_ENFORCE_ANYHIT. */ @@ -1494,9 +1523,6 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size); delete[] reinterpret_cast<uint8_t *>(&motion_transform); - /* Disable instance transform if object uses motion transform already. */ - instance.flags |= OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; - /* Get traversable handle to motion transform. */ optixConvertPointerToTraversableHandle(context, motion_transform_gpu, @@ -1510,10 +1536,6 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) /* Set transform matrix. */ memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform)); } - else { - /* Disable instance transform if geometry already has it applied to vertex data. */ - instance.flags |= OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; - } } } diff --git a/intern/cycles/doc/license/readme.txt b/intern/cycles/doc/license/readme.txt index cc5476e3511..f610d800ebc 100644 --- a/intern/cycles/doc/license/readme.txt +++ b/intern/cycles/doc/license/readme.txt @@ -3,7 +3,7 @@ This program uses code from various sources, the default license is Apache 2.0 for all code, with the following exceptions. Modified BSD License -* Code adapated from Open Shading Language +* Code adapted from Open Shading Language * Sobol direction vectors * Matrix inversion code from OpenEXR * MD5 Hash code diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp index 92bf8e69d19..ec90681b78a 100644 --- a/intern/cycles/integrator/path_trace.cpp +++ b/intern/cycles/integrator/path_trace.cpp @@ -380,7 +380,10 @@ void PathTrace::path_trace(RenderWork &render_work) PathTraceWork *path_trace_work = path_trace_works_[i].get(); PathTraceWork::RenderStatistics statistics; - path_trace_work->render_samples(statistics, render_work.path_trace.start_sample, num_samples); + path_trace_work->render_samples(statistics, + render_work.path_trace.start_sample, + num_samples, + render_work.path_trace.sample_offset); const double work_time = time_dt() - work_start_time; work_balance_infos_[i].time_spent += work_time; @@ -850,7 +853,8 @@ void PathTrace::progress_update_if_needed(const RenderWork &render_work) const uint64_t num_samples_added = uint64_t(tile_size.x) * tile_size.y * render_work.path_trace.num_samples; const int current_sample = render_work.path_trace.start_sample + - render_work.path_trace.num_samples; + render_work.path_trace.num_samples - + render_work.path_trace.sample_offset; progress_->add_samples(num_samples_added, current_sample); } diff --git a/intern/cycles/integrator/path_trace_display.h b/intern/cycles/integrator/path_trace_display.h index b69ee85fbbc..47014f43afa 100644 --- a/intern/cycles/integrator/path_trace_display.h +++ b/intern/cycles/integrator/path_trace_display.h @@ -76,7 +76,7 @@ class PathTraceDisplay { /* Copy buffer of rendered pixels of a given size into a given position of the texture. * - * This function does not acquire a lock. The reason for this is is to allow use of this function + * This function does not acquire a lock. The reason for this is to allow use of this function * for partial updates from different devices. In this case the caller will acquire the lock * once, update all the slices and release * the lock once. This will ensure that draw() will never use partially updated texture. */ diff --git a/intern/cycles/integrator/path_trace_work.h b/intern/cycles/integrator/path_trace_work.h index 0dc7cd2f896..2ebfc913580 100644 --- a/intern/cycles/integrator/path_trace_work.h +++ b/intern/cycles/integrator/path_trace_work.h @@ -75,7 +75,10 @@ class PathTraceWork { /* Render given number of samples as a synchronous blocking call. * The samples are added to the render buffer associated with this work. */ - virtual void render_samples(RenderStatistics &statistics, int start_sample, int samples_num) = 0; + virtual void render_samples(RenderStatistics &statistics, + int start_sample, + int samples_num, + int sample_offset) = 0; /* Copy render result from this work to the corresponding place of the GPU display. * diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp index 12dcc899dbb..2f6c3cf5aca 100644 --- a/intern/cycles/integrator/path_trace_work_cpu.cpp +++ b/intern/cycles/integrator/path_trace_work_cpu.cpp @@ -71,7 +71,8 @@ void PathTraceWorkCPU::init_execution() void PathTraceWorkCPU::render_samples(RenderStatistics &statistics, int start_sample, - int samples_num) + int samples_num, + int sample_offset) { const int64_t image_width = effective_buffer_params_.width; const int64_t image_height = effective_buffer_params_.height; @@ -99,6 +100,7 @@ void PathTraceWorkCPU::render_samples(RenderStatistics &statistics, work_tile.w = 1; work_tile.h = 1; work_tile.start_sample = start_sample; + work_tile.sample_offset = sample_offset; work_tile.num_samples = 1; work_tile.offset = effective_buffer_params_.offset; work_tile.stride = effective_buffer_params_.stride; diff --git a/intern/cycles/integrator/path_trace_work_cpu.h b/intern/cycles/integrator/path_trace_work_cpu.h index 6e734690811..63ab686588c 100644 --- a/intern/cycles/integrator/path_trace_work_cpu.h +++ b/intern/cycles/integrator/path_trace_work_cpu.h @@ -48,7 +48,8 @@ class PathTraceWorkCPU : public PathTraceWork { virtual void render_samples(RenderStatistics &statistics, int start_sample, - int samples_num) override; + int samples_num, + int sample_offset) override; virtual void copy_to_display(PathTraceDisplay *display, PassMode pass_mode, diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp index b9784f68f56..956aa6a8c90 100644 --- a/intern/cycles/integrator/path_trace_work_gpu.cpp +++ b/intern/cycles/integrator/path_trace_work_gpu.cpp @@ -250,7 +250,8 @@ void PathTraceWorkGPU::init_execution() void PathTraceWorkGPU::render_samples(RenderStatistics &statistics, int start_sample, - int samples_num) + int samples_num, + int sample_offset) { /* Limit number of states for the tile and rely on a greedy scheduling of tiles. This allows to * add more work (because tiles are smaller, so there is higher chance that more paths will @@ -261,6 +262,7 @@ void PathTraceWorkGPU::render_samples(RenderStatistics &statistics, work_tile_scheduler_.reset(effective_buffer_params_, start_sample, samples_num, + sample_offset, device_scene_->data.integrator.scrambling_distance); enqueue_reset(); diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h index c5e291e72db..5aa497c26e7 100644 --- a/intern/cycles/integrator/path_trace_work_gpu.h +++ b/intern/cycles/integrator/path_trace_work_gpu.h @@ -46,7 +46,8 @@ class PathTraceWorkGPU : public PathTraceWork { virtual void render_samples(RenderStatistics &statistics, int start_sample, - int samples_num) override; + int samples_num, + int sample_offset) override; virtual void copy_to_display(PathTraceDisplay *display, PassMode pass_mode, diff --git a/intern/cycles/integrator/render_scheduler.cpp b/intern/cycles/integrator/render_scheduler.cpp index 276453f7aec..971173a5e96 100644 --- a/intern/cycles/integrator/render_scheduler.cpp +++ b/intern/cycles/integrator/render_scheduler.cpp @@ -88,6 +88,16 @@ int RenderScheduler::get_num_samples() const return num_samples_; } +void RenderScheduler::set_sample_offset(int sample_offset) +{ + sample_offset_ = sample_offset; +} + +int RenderScheduler::get_sample_offset() const +{ + return sample_offset_; +} + void RenderScheduler::set_time_limit(double time_limit) { time_limit_ = time_limit; @@ -110,13 +120,15 @@ int RenderScheduler::get_num_rendered_samples() const return state_.num_rendered_samples; } -void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples) +void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples, int sample_offset) { buffer_params_ = buffer_params; update_start_resolution_divider(); set_num_samples(num_samples); + set_start_sample(sample_offset); + set_sample_offset(sample_offset); /* In background mode never do lower resolution render preview, as it is not really supported * by the software. */ @@ -171,7 +183,7 @@ void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples) void RenderScheduler::reset_for_next_tile() { - reset(buffer_params_, num_samples_); + reset(buffer_params_, num_samples_, sample_offset_); } bool RenderScheduler::render_work_reschedule_on_converge(RenderWork &render_work) @@ -317,6 +329,7 @@ RenderWork RenderScheduler::get_render_work() render_work.path_trace.start_sample = get_start_sample_to_path_trace(); render_work.path_trace.num_samples = get_num_samples_to_path_trace(); + render_work.path_trace.sample_offset = get_sample_offset(); render_work.init_render_buffers = (render_work.path_trace.start_sample == get_start_sample()); @@ -835,7 +848,7 @@ int RenderScheduler::get_num_samples_to_path_trace() const * When time limit is not used the number of samples per render iteration is either increasing * or stays the same, so there is no need to clamp number of samples calculated for occupancy. */ - if (time_limit_ && state_.start_render_time) { + if (time_limit_ != 0.0 && state_.start_render_time != 0.0) { const double remaining_render_time = max( 0.0, time_limit_ - (time_dt() - state_.start_render_time)); const double time_per_sample_average = path_trace_time_.get_average(); diff --git a/intern/cycles/integrator/render_scheduler.h b/intern/cycles/integrator/render_scheduler.h index d7b7413ae31..28f563c46e3 100644 --- a/intern/cycles/integrator/render_scheduler.h +++ b/intern/cycles/integrator/render_scheduler.h @@ -39,6 +39,7 @@ class RenderWork { struct { int start_sample = 0; int num_samples = 0; + int sample_offset = 0; } path_trace; struct { @@ -125,6 +126,9 @@ class RenderScheduler { void set_num_samples(int num_samples); int get_num_samples() const; + void set_sample_offset(int sample_offset); + int get_sample_offset() const; + /* Time limit for the path tracing tasks, in minutes. * Zero disables the limit. */ void set_time_limit(double time_limit); @@ -150,7 +154,7 @@ class RenderScheduler { /* Reset scheduler, indicating that rendering will happen from scratch. * Resets current rendered state, as well as scheduling information. */ - void reset(const BufferParams &buffer_params, int num_samples); + void reset(const BufferParams &buffer_params, int num_samples, int sample_offset); /* Reset scheduler upon switching to a next tile. * Will keep the same number of samples and full-frame render parameters, but will reset progress @@ -419,6 +423,8 @@ class RenderScheduler { int start_sample_ = 0; int num_samples_ = 0; + int sample_offset_ = 0; + /* Limit in seconds for how long path tracing is allowed to happen. * Zero means no limit is applied. */ double time_limit_ = 0.0; diff --git a/intern/cycles/integrator/work_tile_scheduler.cpp b/intern/cycles/integrator/work_tile_scheduler.cpp index 2d1ac07db7f..d60f7149bf4 100644 --- a/intern/cycles/integrator/work_tile_scheduler.cpp +++ b/intern/cycles/integrator/work_tile_scheduler.cpp @@ -36,6 +36,7 @@ void WorkTileScheduler::set_max_num_path_states(int max_num_path_states) void WorkTileScheduler::reset(const BufferParams &buffer_params, int sample_start, int samples_num, + int sample_offset, float scrambling_distance) { /* Image buffer parameters. */ @@ -51,6 +52,7 @@ void WorkTileScheduler::reset(const BufferParams &buffer_params, /* Samples parameters. */ sample_start_ = sample_start; samples_num_ = samples_num; + sample_offset_ = sample_offset; /* Initialize new scheduling. */ reset_scheduler_state(); @@ -111,6 +113,7 @@ bool WorkTileScheduler::get_work(KernelWorkTile *work_tile_, const int max_work_ work_tile.h = tile_size_.height; work_tile.start_sample = sample_start_ + start_sample; work_tile.num_samples = min(tile_size_.num_samples, samples_num_ - start_sample); + work_tile.sample_offset = sample_offset_; work_tile.offset = offset_; work_tile.stride = stride_; diff --git a/intern/cycles/integrator/work_tile_scheduler.h b/intern/cycles/integrator/work_tile_scheduler.h index d9fa7e84431..2d6395799f7 100644 --- a/intern/cycles/integrator/work_tile_scheduler.h +++ b/intern/cycles/integrator/work_tile_scheduler.h @@ -41,6 +41,7 @@ class WorkTileScheduler { void reset(const BufferParams &buffer_params, int sample_start, int samples_num, + int sample_offset, float scrambling_distance); /* Get work for a device. @@ -79,6 +80,7 @@ class WorkTileScheduler { * (splitting into a smaller work tiles). */ int sample_start_ = 0; int samples_num_ = 0; + int sample_offset_ = 0; /* Tile size which be scheduled for rendering. */ TileSize tile_size_; diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 1a254f5eddc..d759399b04d 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -39,6 +39,10 @@ set(SRC_KERNEL_DEVICE_HIP device/hip/kernel.cpp ) +set(SRC_KERNEL_DEVICE_METAL + device/metal/kernel.metal +) + set(SRC_KERNEL_DEVICE_OPTIX device/optix/kernel.cu device/optix/kernel_shader_raytrace.cu @@ -79,6 +83,13 @@ set(SRC_KERNEL_DEVICE_OPTIX_HEADERS device/optix/globals.h ) +set(SRC_KERNEL_DEVICE_METAL_HEADERS + device/metal/compat.h + device/metal/context_begin.h + device/metal/context_end.h + device/metal/globals.h +) + set(SRC_KERNEL_CLOSURE_HEADERS closure/alloc.h closure/bsdf.h @@ -262,6 +273,7 @@ set(SRC_KERNEL_UTIL_HEADERS ) set(SRC_KERNEL_TYPES_HEADERS + tables.h textures.h types.h ) @@ -399,12 +411,8 @@ if(WITH_CYCLES_CUDA_BINARIES) -I ${CMAKE_CURRENT_SOURCE_DIR}/.. -I ${CMAKE_CURRENT_SOURCE_DIR}/device/cuda --use_fast_math - -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_file}) - - if(${experimental}) - set(cuda_flags ${cuda_flags} -D __KERNEL_EXPERIMENTAL__) - set(name ${name}_experimental) - endif() + -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_file} + -Wno-deprecated-gpu-targets) if(WITH_NANOVDB) set(cuda_flags ${cuda_flags} @@ -412,6 +420,10 @@ if(WITH_CYCLES_CUDA_BINARIES) -I "${NANOVDB_INCLUDE_DIR}") endif() + if(WITH_CYCLES_DEBUG) + set(cuda_flags ${cuda_flags} -D WITH_CYCLES_DEBUG) + endif() + if(WITH_CYCLES_CUBIN_COMPILER) string(SUBSTRING ${arch} 3 -1 CUDA_ARCH) @@ -560,11 +572,6 @@ if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP) -ffast-math -o ${CMAKE_CURRENT_BINARY_DIR}/${hip_file}) - if(${experimental}) - set(hip_flags ${hip_flags} -D __KERNEL_EXPERIMENTAL__) - set(name ${name}_experimental) - endif() - if(WITH_NANOVDB) set(hip_flags ${hip_flags} -D WITH_NANOVDB @@ -572,7 +579,7 @@ if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP) endif() if(WITH_CYCLES_DEBUG) - set(hip_flags ${hip_flags} -D __KERNEL_DEBUG__) + set(hip_flags ${hip_flags} -D WITH_CYCLES_DEBUG) endif() add_custom_command( @@ -613,6 +620,10 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES) -I "${NANOVDB_INCLUDE_DIR}") endif() + if(WITH_CYCLES_DEBUG) + set(cuda_flags ${cuda_flags} -D WITH_CYCLES_DEBUG) + endif() + if(WITH_CYCLES_CUBIN_COMPILER) # Needed to find libnvrtc-builtins.so. Can't do it from inside # cycles_cubin_cc since the env variable is read before main() @@ -701,7 +712,7 @@ if(WITH_COMPILER_ASAN) string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " -fno-sanitize=all") string(APPEND CMAKE_CXX_FLAGS_DEBUG " -fno-sanitize=vptr") elseif(CMAKE_C_COMPILER_ID MATCHES "Clang") - # With OSL, Cycles disables rtti in some modules, wich then breaks at linking + # With OSL, Cycles disables rtti in some modules, which then breaks at linking # when trying to use vptr sanitizer (included into 'undefined' general option). string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " -fno-sanitize=vptr") string(APPEND CMAKE_CXX_FLAGS_DEBUG " -fno-sanitize=vptr") @@ -729,12 +740,14 @@ cycles_add_library(cycles_kernel "${LIB}" ${SRC_KERNEL_DEVICE_CUDA} ${SRC_KERNEL_DEVICE_HIP} ${SRC_KERNEL_DEVICE_OPTIX} + ${SRC_KERNEL_DEVICE_METAL} ${SRC_KERNEL_HEADERS} ${SRC_KERNEL_DEVICE_CPU_HEADERS} ${SRC_KERNEL_DEVICE_GPU_HEADERS} ${SRC_KERNEL_DEVICE_CUDA_HEADERS} ${SRC_KERNEL_DEVICE_HIP_HEADERS} ${SRC_KERNEL_DEVICE_OPTIX_HEADERS} + ${SRC_KERNEL_DEVICE_METAL_HEADERS} ) source_group("bake" FILES ${SRC_KERNEL_BAKE_HEADERS}) @@ -746,6 +759,7 @@ source_group("device\\cuda" FILES ${SRC_KERNEL_DEVICE_CUDA} ${SRC_KERNEL_DEVICE_ source_group("device\\gpu" FILES ${SRC_KERNEL_DEVICE_GPU_HEADERS}) source_group("device\\hip" FILES ${SRC_KERNEL_DEVICE_HIP} ${SRC_KERNEL_DEVICE_HIP_HEADERS}) source_group("device\\optix" FILES ${SRC_KERNEL_DEVICE_OPTIX} ${SRC_KERNEL_DEVICE_OPTIX_HEADERS}) +source_group("device\\metal" FILES ${SRC_KERNEL_DEVICE_METAL} ${SRC_KERNEL_DEVICE_METAL_HEADERS}) source_group("film" FILES ${SRC_KERNEL_FILM_HEADERS}) source_group("geom" FILES ${SRC_KERNEL_GEOM_HEADERS}) source_group("integrator" FILES ${SRC_KERNEL_INTEGRATOR_HEADERS}) @@ -778,6 +792,8 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIP}" ${CYCLES_ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIP_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_OPTIX}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_METAL}" ${CYCLES_INSTALL_PATH}/source/kernel/device/metal) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_METAL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/metal) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_FILM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/film) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_INTEGRATOR_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/integrator) diff --git a/intern/cycles/kernel/bvh/util.h b/intern/cycles/kernel/bvh/util.h index 8686f887021..26ba136dd79 100644 --- a/intern/cycles/kernel/bvh/util.h +++ b/intern/cycles/kernel/bvh/util.h @@ -97,7 +97,7 @@ ccl_device_inline void sort_intersections_and_normals(ccl_private Intersection * swapped = false; for (int j = 0; j < num_hits - 1; ++j) { if (hits[j].t > hits[j + 1].t) { - struct Intersection tmp_hit = hits[j]; + Intersection tmp_hit = hits[j]; float3 tmp_Ng = Ng[j]; hits[j] = hits[j + 1]; Ng[j] = Ng[j + 1]; diff --git a/intern/cycles/kernel/device/cpu/globals.h b/intern/cycles/kernel/device/cpu/globals.h index dd0327b3f94..746e48b9880 100644 --- a/intern/cycles/kernel/device/cpu/globals.h +++ b/intern/cycles/kernel/device/cpu/globals.h @@ -18,6 +18,7 @@ #pragma once +#include "kernel/tables.h" #include "kernel/types.h" #include "kernel/util/profiling.h" diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h index 1ee82e6eb7c..658dec102b1 100644 --- a/intern/cycles/kernel/device/cuda/compat.h +++ b/intern/cycles/kernel/device/cuda/compat.h @@ -52,8 +52,9 @@ typedef unsigned long long uint64_t; #endif #define ccl_device_noinline __device__ __noinline__ #define ccl_device_noinline_cpu ccl_device +#define ccl_device_inline_method ccl_device #define ccl_global -#define ccl_static_constant __constant__ +#define ccl_inline_constant __constant__ #define ccl_device_constant __constant__ __device__ #define ccl_constant const #define ccl_gpu_shared __shared__ @@ -75,6 +76,7 @@ typedef unsigned long long uint64_t; #define ccl_gpu_block_idx_x (blockIdx.x) #define ccl_gpu_grid_dim_x (gridDim.x) #define ccl_gpu_warp_size (warpSize) +#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x) #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x) @@ -84,7 +86,6 @@ typedef unsigned long long uint64_t; #define ccl_gpu_syncthreads() __syncthreads() #define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate) #define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla) -#define ccl_gpu_popc(x) __popc(x) /* GPU texture objects */ diff --git a/intern/cycles/kernel/device/cuda/config.h b/intern/cycles/kernel/device/cuda/config.h index 46196dcdb51..003881d7912 100644 --- a/intern/cycles/kernel/device/cuda/config.h +++ b/intern/cycles/kernel/device/cuda/config.h @@ -92,12 +92,29 @@ /* Compute number of threads per block and minimum blocks per multiprocessor * given the maximum number of registers per thread. */ - #define ccl_gpu_kernel(block_num_threads, thread_num_registers) \ extern "C" __global__ void __launch_bounds__(block_num_threads, \ GPU_MULTIPRESSOR_MAX_REGISTERS / \ (block_num_threads * thread_num_registers)) +#define ccl_gpu_kernel_threads(block_num_threads) \ + extern "C" __global__ void __launch_bounds__(block_num_threads) + +#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__) + +#define ccl_gpu_kernel_call(x) x + +/* Define a function object where "func" is the lambda body, and additional parameters are used to + * specify captured state */ +#define ccl_gpu_kernel_lambda(func, ...) \ + struct KernelLambda { \ + __VA_ARGS__; \ + __device__ int operator()(const int state) \ + { \ + return (func); \ + } \ + } ccl_gpu_kernel_lambda_pass + /* sanity checks */ #if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS diff --git a/intern/cycles/kernel/device/gpu/image.h b/intern/cycles/kernel/device/gpu/image.h index 95a37c693ae..0900a45c83d 100644 --- a/intern/cycles/kernel/device/gpu/image.h +++ b/intern/cycles/kernel/device/gpu/image.h @@ -65,7 +65,9 @@ ccl_device float cubic_h1(float a) /* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */ template<typename T> -ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y) +ccl_device_noinline T kernel_tex_image_interp_bicubic(ccl_global const TextureInfo &info, + float x, + float y) { ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; @@ -94,7 +96,7 @@ ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, f /* Fast tricubic texture lookup using 8 trilinear lookups. */ template<typename T> ccl_device_noinline T -kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z) +kernel_tex_image_interp_tricubic(ccl_global const TextureInfo &info, float x, float y, float z) { ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; @@ -169,7 +171,7 @@ ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, fl template<typename T> ccl_device_noinline T kernel_tex_image_interp_nanovdb( - const TextureInfo &info, float x, float y, float z, uint interpolation) + ccl_global const TextureInfo &info, float x, float y, float z, uint interpolation) { using namespace nanovdb; @@ -191,7 +193,7 @@ ccl_device_noinline T kernel_tex_image_interp_nanovdb( ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y) { - const TextureInfo &info = kernel_tex_fetch(__texture_info, id); + ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id); /* float4, byte4, ushort4 and half4 */ const int texture_type = info.data_type; @@ -226,7 +228,7 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg, float3 P, InterpolationType interp) { - const TextureInfo &info = kernel_tex_fetch(__texture_info, id); + ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id); if (info.use_transform_3d) { P = transform_point(&info.transform_3d, P); diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 56fcc38b907..22e2a61a06d 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -21,6 +21,13 @@ #include "kernel/device/gpu/parallel_sorted_index.h" #include "kernel/device/gpu/work_stealing.h" +/* Include constant tables before entering Metal's context class scope (context_begin.h) */ +#include "kernel/tables.h" + +#ifdef __KERNEL_METAL__ +# include "kernel/device/metal/context_begin.h" +#endif + #include "kernel/integrator/state.h" #include "kernel/integrator/state_flow.h" #include "kernel/integrator/state_util.h" @@ -40,6 +47,11 @@ #include "kernel/bake/bake.h" #include "kernel/film/adaptive_sampling.h" + +#ifdef __KERNEL_METAL__ +# include "kernel/device/metal/context_end.h" +#endif + #include "kernel/film/read.h" /* -------------------------------------------------------------------- @@ -47,7 +59,7 @@ */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_reset(int num_states) + ccl_gpu_kernel_signature(integrator_reset, int num_states) { const int state = ccl_gpu_global_id_x(); @@ -58,10 +70,11 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_init_from_camera(KernelWorkTile *tiles, - const int num_tiles, - float *render_buffer, - const int max_tile_work_size) + ccl_gpu_kernel_signature(integrator_init_from_camera, + ccl_global KernelWorkTile *tiles, + const int num_tiles, + ccl_global float *render_buffer, + const int max_tile_work_size) { const int work_index = ccl_gpu_global_id_x(); @@ -72,7 +85,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) const int tile_index = work_index / max_tile_work_size; const int tile_work_index = work_index - tile_index * max_tile_work_size; - const KernelWorkTile *tile = &tiles[tile_index]; + ccl_global const KernelWorkTile *tile = &tiles[tile_index]; if (tile_work_index >= tile->work_size) { return; @@ -83,14 +96,16 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) uint x, y, sample; get_work_pixel(tile, tile_work_index, &x, &y, &sample); - integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample); + ccl_gpu_kernel_call( + integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample)); } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_init_from_bake(KernelWorkTile *tiles, - const int num_tiles, - float *render_buffer, - const int max_tile_work_size) + ccl_gpu_kernel_signature(integrator_init_from_bake, + ccl_global KernelWorkTile *tiles, + const int num_tiles, + ccl_global float *render_buffer, + const int max_tile_work_size) { const int work_index = ccl_gpu_global_id_x(); @@ -101,7 +116,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) const int tile_index = work_index / max_tile_work_size; const int tile_work_index = work_index - tile_index * max_tile_work_size; - const KernelWorkTile *tile = &tiles[tile_index]; + ccl_global const KernelWorkTile *tile = &tiles[tile_index]; if (tile_work_index >= tile->work_size) { return; @@ -112,230 +127,264 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) uint x, y, sample; get_work_pixel(tile, tile_work_index, &x, &y, &sample); - integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample); + ccl_gpu_kernel_call( + integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample)); } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_intersect_closest(const int *path_index_array, - ccl_global float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_intersect_closest, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_intersect_closest(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_intersect_closest(NULL, state, render_buffer)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_intersect_shadow(const int *path_index_array, const int work_size) + ccl_gpu_kernel_signature(integrator_intersect_shadow, + ccl_global const int *path_index_array, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_intersect_shadow(NULL, state); + ccl_gpu_kernel_call(integrator_intersect_shadow(NULL, state)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_intersect_subsurface(const int *path_index_array, const int work_size) + ccl_gpu_kernel_signature(integrator_intersect_subsurface, + ccl_global const int *path_index_array, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_intersect_subsurface(NULL, state); + ccl_gpu_kernel_call(integrator_intersect_subsurface(NULL, state)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_intersect_volume_stack(const int *path_index_array, const int work_size) + ccl_gpu_kernel_signature(integrator_intersect_volume_stack, + ccl_global const int *path_index_array, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_intersect_volume_stack(NULL, state); + ccl_gpu_kernel_call(integrator_intersect_volume_stack(NULL, state)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_background(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_background, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_shade_background(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_background(NULL, state, render_buffer)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_light(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_light, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_shade_light(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_light(NULL, state, render_buffer)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_shadow(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_shadow, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_shade_shadow(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_shadow(NULL, state, render_buffer)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_surface(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_surface, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_shade_surface(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_surface(NULL, state, render_buffer)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_surface_raytrace(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_surface_raytrace, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_shade_surface_raytrace(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_surface_raytrace(NULL, state, render_buffer)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_volume(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_volume, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_shade_volume(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_volume(NULL, state, render_buffer)); } } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_queued_paths_array(int num_states, - int *indices, - int *num_indices, - int kernel) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_queued_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int kernel_index) { + ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index, + int kernel_index); + ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index; + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, indices, num_indices, [kernel](const int state) { - return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel); - }); + num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_queued_shadow_paths_array(int num_states, - int *indices, - int *num_indices, - int kernel) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_queued_shadow_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int kernel_index) { + ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel_index, + int kernel_index); + ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index; + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, indices, num_indices, [kernel](const int state) { - return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel); - }); + num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_active_paths_array(int num_states, int *indices, int *num_indices) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_active_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices) { + ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0); + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, indices, num_indices, [](const int state) { - return (INTEGRATOR_STATE(state, path, queued_kernel) != 0); - }); + num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_terminated_paths_array(int num_states, - int *indices, - int *num_indices, - int indices_offset) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_terminated_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int indices_offset) { + ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0); + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, indices + indices_offset, num_indices, [](const int state) { - return (INTEGRATOR_STATE(state, path, queued_kernel) == 0); - }); + num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_terminated_shadow_paths_array(int num_states, - int *indices, - int *num_indices, - int indices_offset) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_terminated_shadow_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int indices_offset) { + ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0); + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, indices + indices_offset, num_indices, [](const int state) { - return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0); - }); -} - -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_sorted_paths_array(int num_states, - int num_states_limit, - int *indices, - int *num_indices, - int *key_counter, - int *key_prefix_sum, - int kernel) -{ - gpu_parallel_sorted_index_array<GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, - num_states_limit, - indices, - num_indices, - key_counter, - key_prefix_sum, - [kernel](const int state) { - return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel) ? - INTEGRATOR_STATE(state, path, shader_sort_key) : - GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY; - }); -} - -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_compact_paths_array(int num_states, - int *indices, - int *num_indices, - int num_active_paths) -{ + num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass); +} + +ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_sorted_paths_array, + int num_states, + int num_states_limit, + ccl_global int *indices, + ccl_global int *num_indices, + ccl_global int *key_counter, + ccl_global int *key_prefix_sum, + int kernel_index) +{ + ccl_gpu_kernel_lambda((INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index) ? + INTEGRATOR_STATE(state, path, shader_sort_key) : + GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY, + int kernel_index); + ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index; + + const uint state_index = ccl_gpu_global_id_x(); + gpu_parallel_sorted_index_array(state_index, + num_states, + num_states_limit, + indices, + num_indices, + key_counter, + key_prefix_sum, + ccl_gpu_kernel_lambda_pass); +} + +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_compact_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int num_active_paths) +{ + ccl_gpu_kernel_lambda((state >= num_active_paths) && + (INTEGRATOR_STATE(state, path, queued_kernel) != 0), + int num_active_paths); + ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths; + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, indices, num_indices, [num_active_paths](const int state) { - return (state >= num_active_paths) && (INTEGRATOR_STATE(state, path, queued_kernel) != 0); - }); + num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_compact_states(const int *active_terminated_states, - const int active_states_offset, - const int terminated_states_offset, - const int work_size) +ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_compact_states, + ccl_global const int *active_terminated_states, + const int active_states_offset, + const int terminated_states_offset, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); @@ -343,28 +392,32 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B const int from_state = active_terminated_states[active_states_offset + global_index]; const int to_state = active_terminated_states[terminated_states_offset + global_index]; - integrator_state_move(NULL, to_state, from_state); + ccl_gpu_kernel_call(integrator_state_move(NULL, to_state, from_state)); } } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_compact_shadow_paths_array(int num_states, - int *indices, - int *num_indices, - int num_active_paths) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_compact_shadow_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int num_active_paths) { + ccl_gpu_kernel_lambda((state >= num_active_paths) && + (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0), + int num_active_paths); + ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths; + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, indices, num_indices, [num_active_paths](const int state) { - return (state >= num_active_paths) && - (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0); - }); + num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_compact_shadow_states(const int *active_terminated_states, - const int active_states_offset, - const int terminated_states_offset, - const int work_size) +ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_compact_shadow_states, + ccl_global const int *active_terminated_states, + const int active_states_offset, + const int terminated_states_offset, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); @@ -372,15 +425,14 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B const int from_state = active_terminated_states[active_states_offset + global_index]; const int to_state = active_terminated_states[terminated_states_offset + global_index]; - integrator_shadow_state_move(NULL, to_state, from_state); + ccl_gpu_kernel_call(integrator_shadow_state_move(NULL, to_state, from_state)); } } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) - kernel_gpu_prefix_sum(int *counter, int *prefix_sum, int num_values) +ccl_gpu_kernel_threads(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature( + prefix_sum, ccl_global int *counter, ccl_global int *prefix_sum, int num_values) { - gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>( - counter, prefix_sum, num_values); + gpu_parallel_prefix_sum(ccl_gpu_global_id_x(), counter, prefix_sum, num_values); } /* -------------------------------------------------------------------- @@ -388,16 +440,17 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLO */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_adaptive_sampling_convergence_check(float *render_buffer, - int sx, - int sy, - int sw, - int sh, - float threshold, - bool reset, - int offset, - int stride, - uint *num_active_pixels) + ccl_gpu_kernel_signature(adaptive_sampling_convergence_check, + ccl_global float *render_buffer, + int sx, + int sy, + int sw, + int sh, + float threshold, + bool reset, + int offset, + int stride, + ccl_global uint *num_active_pixels) { const int work_index = ccl_gpu_global_id_x(); const int y = work_index / sw; @@ -406,37 +459,51 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) bool converged = true; if (x < sw && y < sh) { - converged = kernel_adaptive_sampling_convergence_check( - nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride); + converged = ccl_gpu_kernel_call(kernel_adaptive_sampling_convergence_check( + nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride)); } /* NOTE: All threads specified in the mask must execute the intrinsic. */ - const uint num_active_pixels_mask = ccl_gpu_ballot(!converged); + const auto num_active_pixels_mask = ccl_gpu_ballot(!converged); const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size; if (lane_id == 0) { - atomic_fetch_and_add_uint32(num_active_pixels, __popc(num_active_pixels_mask)); + atomic_fetch_and_add_uint32(num_active_pixels, popcount(num_active_pixels_mask)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_adaptive_sampling_filter_x( - float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride) + ccl_gpu_kernel_signature(adaptive_sampling_filter_x, + ccl_global float *render_buffer, + int sx, + int sy, + int sw, + int sh, + int offset, + int stride) { const int y = ccl_gpu_global_id_x(); if (y < sh) { - kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride); + ccl_gpu_kernel_call( + kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_adaptive_sampling_filter_y( - float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride) + ccl_gpu_kernel_signature(adaptive_sampling_filter_y, + ccl_global float *render_buffer, + int sx, + int sy, + int sw, + int sh, + int offset, + int stride) { const int x = ccl_gpu_global_id_x(); if (x < sw) { - kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride); + ccl_gpu_kernel_call( + kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride)); } } @@ -445,12 +512,14 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_cryptomatte_postprocess(float *render_buffer, int num_pixels) + ccl_gpu_kernel_signature(cryptomatte_postprocess, + ccl_global float *render_buffer, + int num_pixels) { const int pixel_index = ccl_gpu_global_id_x(); if (pixel_index < num_pixels) { - kernel_cryptomatte_post(nullptr, render_buffer, pixel_index); + ccl_gpu_kernel_call(kernel_cryptomatte_post(nullptr, render_buffer, pixel_index)); } } @@ -458,36 +527,6 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) * Film. */ -/* Common implementation for float destination. */ -template<typename Processor> -ccl_device_inline void kernel_gpu_film_convert_common(const KernelFilmConvert *kfilm_convert, - float *pixels, - float *render_buffer, - int num_pixels, - int width, - int offset, - int stride, - int dst_offset, - int dst_stride, - const Processor &processor) -{ - const int render_pixel_index = ccl_gpu_global_id_x(); - if (render_pixel_index >= num_pixels) { - return; - } - - const int x = render_pixel_index % width; - const int y = render_pixel_index / width; - - ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert->pass_stride + - y * stride * kfilm_convert->pass_stride; - - ccl_global float *pixel = pixels + - (render_pixel_index + dst_offset) * kfilm_convert->pixel_stride; - - processor(kfilm_convert, buffer, pixel); -} - ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgba, const int rgba_offset, const int rgba_stride, @@ -508,177 +547,95 @@ ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgb #endif } -/* Common implementation for half4 destination and 4-channel input pass. */ -template<typename Processor> -ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba( - const KernelFilmConvert *kfilm_convert, - uchar4 *rgba, - float *render_buffer, - int num_pixels, - int width, - int offset, - int stride, - int rgba_offset, - int rgba_stride, - const Processor &processor) -{ - const int render_pixel_index = ccl_gpu_global_id_x(); - if (render_pixel_index >= num_pixels) { - return; - } - - const int x = render_pixel_index % width; - const int y = render_pixel_index / width; - - ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert->pass_stride + - y * stride * kfilm_convert->pass_stride; - - float pixel[4]; - processor(kfilm_convert, buffer, pixel); - - film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel); - - const half4 half_pixel = float4_to_half4_display( - make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); - kernel_gpu_film_convert_half_write(rgba, rgba_offset, rgba_stride, x, y, half_pixel); -} - -/* Common implementation for half4 destination and 3-channel input pass. */ -template<typename Processor> -ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgb( - const KernelFilmConvert *kfilm_convert, - uchar4 *rgba, - float *render_buffer, - int num_pixels, - int width, - int offset, - int stride, - int rgba_offset, - int rgba_stride, - const Processor &processor) -{ - kernel_gpu_film_convert_half_rgba_common_rgba( - kfilm_convert, - rgba, - render_buffer, - num_pixels, - width, - offset, - stride, - rgba_offset, - rgba_stride, - [&processor](const KernelFilmConvert *kfilm_convert, - ccl_global const float *buffer, - float *pixel_rgba) { - processor(kfilm_convert, buffer, pixel_rgba); - pixel_rgba[3] = 1.0f; - }); -} - -/* Common implementation for half4 destination and single channel input pass. */ -template<typename Processor> -ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_value( - const KernelFilmConvert *kfilm_convert, - uchar4 *rgba, - float *render_buffer, - int num_pixels, - int width, - int offset, - int stride, - int rgba_offset, - int rgba_stride, - const Processor &processor) -{ - kernel_gpu_film_convert_half_rgba_common_rgba( - kfilm_convert, - rgba, - render_buffer, - num_pixels, - width, - offset, - stride, - rgba_offset, - rgba_stride, - [&processor](const KernelFilmConvert *kfilm_convert, - ccl_global const float *buffer, - float *pixel_rgba) { - float value; - processor(kfilm_convert, buffer, &value); - - pixel_rgba[0] = value; - pixel_rgba[1] = value; - pixel_rgba[2] = value; - pixel_rgba[3] = 1.0f; - }); -} - -#define KERNEL_FILM_CONVERT_PROC(name) \ - ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) name - -#define KERNEL_FILM_CONVERT_DEFINE(variant, channels) \ - KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant) \ - (const KernelFilmConvert kfilm_convert, \ - float *pixels, \ - float *render_buffer, \ - int num_pixels, \ - int width, \ - int offset, \ - int stride, \ - int rgba_offset, \ - int rgba_stride) \ +#define KERNEL_FILM_CONVERT_VARIANT(variant, input_channel_count) \ + ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \ + ccl_gpu_kernel_signature(film_convert_##variant, \ + const KernelFilmConvert kfilm_convert, \ + ccl_global float *pixels, \ + ccl_global float *render_buffer, \ + int num_pixels, \ + int width, \ + int offset, \ + int stride, \ + int rgba_offset, \ + int rgba_stride) \ { \ - kernel_gpu_film_convert_common(&kfilm_convert, \ - pixels, \ - render_buffer, \ - num_pixels, \ - width, \ - offset, \ - stride, \ - rgba_offset, \ - rgba_stride, \ - film_get_pass_pixel_##variant); \ + const int render_pixel_index = ccl_gpu_global_id_x(); \ + if (render_pixel_index >= num_pixels) { \ + return; \ + } \ +\ + const int x = render_pixel_index % width; \ + const int y = render_pixel_index / width; \ +\ + ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \ + y * stride * kfilm_convert.pass_stride; \ +\ + ccl_global float *pixel = pixels + \ + (render_pixel_index + rgba_offset) * kfilm_convert.pixel_stride; \ +\ + film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \ } \ - KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant##_half_rgba) \ - (const KernelFilmConvert kfilm_convert, \ - uchar4 *rgba, \ - float *render_buffer, \ - int num_pixels, \ - int width, \ - int offset, \ - int stride, \ - int rgba_offset, \ - int rgba_stride) \ +\ + ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \ + ccl_gpu_kernel_signature(film_convert_##variant##_half_rgba, \ + const KernelFilmConvert kfilm_convert, \ + ccl_global uchar4 *rgba, \ + ccl_global float *render_buffer, \ + int num_pixels, \ + int width, \ + int offset, \ + int stride, \ + int rgba_offset, \ + int rgba_stride) \ { \ - kernel_gpu_film_convert_half_rgba_common_##channels(&kfilm_convert, \ - rgba, \ - render_buffer, \ - num_pixels, \ - width, \ - offset, \ - stride, \ - rgba_offset, \ - rgba_stride, \ - film_get_pass_pixel_##variant); \ - } - -KERNEL_FILM_CONVERT_DEFINE(depth, value) -KERNEL_FILM_CONVERT_DEFINE(mist, value) -KERNEL_FILM_CONVERT_DEFINE(sample_count, value) -KERNEL_FILM_CONVERT_DEFINE(float, value) - -KERNEL_FILM_CONVERT_DEFINE(light_path, rgb) -KERNEL_FILM_CONVERT_DEFINE(float3, rgb) - -KERNEL_FILM_CONVERT_DEFINE(motion, rgba) -KERNEL_FILM_CONVERT_DEFINE(cryptomatte, rgba) -KERNEL_FILM_CONVERT_DEFINE(shadow_catcher, rgba) -KERNEL_FILM_CONVERT_DEFINE(shadow_catcher_matte_with_shadow, rgba) -KERNEL_FILM_CONVERT_DEFINE(combined, rgba) -KERNEL_FILM_CONVERT_DEFINE(float4, rgba) - -#undef KERNEL_FILM_CONVERT_DEFINE -#undef KERNEL_FILM_CONVERT_HALF_RGBA_DEFINE -#undef KERNEL_FILM_CONVERT_PROC + const int render_pixel_index = ccl_gpu_global_id_x(); \ + if (render_pixel_index >= num_pixels) { \ + return; \ + } \ +\ + const int x = render_pixel_index % width; \ + const int y = render_pixel_index / width; \ +\ + ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \ + y * stride * kfilm_convert.pass_stride; \ +\ + float pixel[4]; \ + film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \ +\ + if (input_channel_count == 1) { \ + pixel[1] = pixel[2] = pixel[0]; \ + } \ + if (input_channel_count <= 3) { \ + pixel[3] = 1.0f; \ + } \ +\ + film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \ +\ + const half4 half_pixel = float4_to_half4_display( \ + make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); \ + kernel_gpu_film_convert_half_write(rgba, rgba_offset, rgba_stride, x, y, half_pixel); \ + } + +/* 1 channel inputs */ +KERNEL_FILM_CONVERT_VARIANT(depth, 1) +KERNEL_FILM_CONVERT_VARIANT(mist, 1) +KERNEL_FILM_CONVERT_VARIANT(sample_count, 1) +KERNEL_FILM_CONVERT_VARIANT(float, 1) + +/* 3 channel inputs */ +KERNEL_FILM_CONVERT_VARIANT(light_path, 3) +KERNEL_FILM_CONVERT_VARIANT(float3, 3) + +/* 4 channel inputs */ +KERNEL_FILM_CONVERT_VARIANT(motion, 4) +KERNEL_FILM_CONVERT_VARIANT(cryptomatte, 4) +KERNEL_FILM_CONVERT_VARIANT(shadow_catcher, 4) +KERNEL_FILM_CONVERT_VARIANT(shadow_catcher_matte_with_shadow, 4) +KERNEL_FILM_CONVERT_VARIANT(combined, 4) +KERNEL_FILM_CONVERT_VARIANT(float4, 4) + +#undef KERNEL_FILM_CONVERT_VARIANT /* -------------------------------------------------------------------- * Shader evaluation. @@ -687,42 +644,46 @@ KERNEL_FILM_CONVERT_DEFINE(float4, rgba) /* Displacement */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_shader_eval_displace(KernelShaderEvalInput *input, - float *output, - const int offset, - const int work_size) + ccl_gpu_kernel_signature(shader_eval_displace, + ccl_global KernelShaderEvalInput *input, + ccl_global float *output, + const int offset, + const int work_size) { int i = ccl_gpu_global_id_x(); if (i < work_size) { - kernel_displace_evaluate(NULL, input, output, offset + i); + ccl_gpu_kernel_call(kernel_displace_evaluate(NULL, input, output, offset + i)); } } /* Background */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_shader_eval_background(KernelShaderEvalInput *input, - float *output, - const int offset, - const int work_size) + ccl_gpu_kernel_signature(shader_eval_background, + ccl_global KernelShaderEvalInput *input, + ccl_global float *output, + const int offset, + const int work_size) { int i = ccl_gpu_global_id_x(); if (i < work_size) { - kernel_background_evaluate(NULL, input, output, offset + i); + ccl_gpu_kernel_call(kernel_background_evaluate(NULL, input, output, offset + i)); } } /* Curve Shadow Transparency */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_shader_eval_curve_shadow_transparency(KernelShaderEvalInput *input, - float *output, - const int offset, - const int work_size) + ccl_gpu_kernel_signature(shader_eval_curve_shadow_transparency, + ccl_global KernelShaderEvalInput *input, + ccl_global float *output, + const int offset, + const int work_size) { int i = ccl_gpu_global_id_x(); if (i < work_size) { - kernel_curve_shadow_transparency_evaluate(NULL, input, output, offset + i); + ccl_gpu_kernel_call( + kernel_curve_shadow_transparency_evaluate(NULL, input, output, offset + i)); } } @@ -731,15 +692,16 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_filter_color_preprocess(float *render_buffer, - int full_x, - int full_y, - int width, - int height, - int offset, - int stride, - int pass_stride, - int pass_denoised) + ccl_gpu_kernel_signature(filter_color_preprocess, + ccl_global float *render_buffer, + int full_x, + int full_y, + int width, + int height, + int offset, + int stride, + int pass_stride, + int pass_denoised) { const int work_index = ccl_gpu_global_id_x(); const int y = work_index / width; @@ -750,31 +712,32 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride; - float *buffer = render_buffer + render_pixel_index * pass_stride; + ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride; - float *color_out = buffer + pass_denoised; + ccl_global float *color_out = buffer + pass_denoised; color_out[0] = clamp(color_out[0], 0.0f, 10000.0f); color_out[1] = clamp(color_out[1], 0.0f, 10000.0f); color_out[2] = clamp(color_out[2], 0.0f, 10000.0f); } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_filter_guiding_preprocess(float *guiding_buffer, - int guiding_pass_stride, - int guiding_pass_albedo, - int guiding_pass_normal, - const float *render_buffer, - int render_offset, - int render_stride, - int render_pass_stride, - int render_pass_sample_count, - int render_pass_denoising_albedo, - int render_pass_denoising_normal, - int full_x, - int full_y, - int width, - int height, - int num_samples) + ccl_gpu_kernel_signature(filter_guiding_preprocess, + ccl_global float *guiding_buffer, + int guiding_pass_stride, + int guiding_pass_albedo, + int guiding_pass_normal, + ccl_global const float *render_buffer, + int render_offset, + int render_stride, + int render_pass_stride, + int render_pass_sample_count, + int render_pass_denoising_albedo, + int render_pass_denoising_normal, + int full_x, + int full_y, + int width, + int height, + int num_samples) { const int work_index = ccl_gpu_global_id_x(); const int y = work_index / width; @@ -785,10 +748,10 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } const uint64_t guiding_pixel_index = x + y * width; - float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride; + ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride; const uint64_t render_pixel_index = render_offset + (x + full_x) + (y + full_y) * render_stride; - const float *buffer = render_buffer + render_pixel_index * render_pass_stride; + ccl_global const float *buffer = render_buffer + render_pixel_index * render_pass_stride; float pixel_scale; if (render_pass_sample_count == PASS_UNUSED) { @@ -802,8 +765,8 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) if (guiding_pass_albedo != PASS_UNUSED) { kernel_assert(render_pass_denoising_albedo != PASS_UNUSED); - const float *aledo_in = buffer + render_pass_denoising_albedo; - float *albedo_out = guiding_pixel + guiding_pass_albedo; + ccl_global const float *aledo_in = buffer + render_pass_denoising_albedo; + ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo; albedo_out[0] = aledo_in[0] * pixel_scale; albedo_out[1] = aledo_in[1] * pixel_scale; @@ -814,8 +777,8 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) if (guiding_pass_normal != PASS_UNUSED) { kernel_assert(render_pass_denoising_normal != PASS_UNUSED); - const float *normal_in = buffer + render_pass_denoising_normal; - float *normal_out = guiding_pixel + guiding_pass_normal; + ccl_global const float *normal_in = buffer + render_pass_denoising_normal; + ccl_global float *normal_out = guiding_pixel + guiding_pass_normal; normal_out[0] = normal_in[0] * pixel_scale; normal_out[1] = normal_in[1] * pixel_scale; @@ -824,11 +787,12 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_filter_guiding_set_fake_albedo(float *guiding_buffer, - int guiding_pass_stride, - int guiding_pass_albedo, - int width, - int height) + ccl_gpu_kernel_signature(filter_guiding_set_fake_albedo, + ccl_global float *guiding_buffer, + int guiding_pass_stride, + int guiding_pass_albedo, + int width, + int height) { kernel_assert(guiding_pass_albedo != PASS_UNUSED); @@ -841,9 +805,9 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } const uint64_t guiding_pixel_index = x + y * width; - float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride; + ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride; - float *albedo_out = guiding_pixel + guiding_pass_albedo; + ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo; albedo_out[0] = 0.5f; albedo_out[1] = 0.5f; @@ -851,20 +815,21 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_filter_color_postprocess(float *render_buffer, - int full_x, - int full_y, - int width, - int height, - int offset, - int stride, - int pass_stride, - int num_samples, - int pass_noisy, - int pass_denoised, - int pass_sample_count, - int num_components, - bool use_compositing) + ccl_gpu_kernel_signature(filter_color_postprocess, + ccl_global float *render_buffer, + int full_x, + int full_y, + int width, + int height, + int offset, + int stride, + int pass_stride, + int num_samples, + int pass_noisy, + int pass_denoised, + int pass_sample_count, + int num_components, + bool use_compositing) { const int work_index = ccl_gpu_global_id_x(); const int y = work_index / width; @@ -875,7 +840,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride; - float *buffer = render_buffer + render_pixel_index * pass_stride; + ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride; float pixel_scale; if (pass_sample_count == PASS_UNUSED) { @@ -885,7 +850,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) pixel_scale = __float_as_uint(buffer[pass_sample_count]); } - float *denoised_pixel = buffer + pass_denoised; + ccl_global float *denoised_pixel = buffer + pass_denoised; denoised_pixel[0] *= pixel_scale; denoised_pixel[1] *= pixel_scale; @@ -898,7 +863,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) /* Currently compositing passes are either 3-component (derived by dividing light passes) * or do not have transparency (shadow catcher). Implicitly rely on this logic, as it * simplifies logic and avoids extra memory allocation. */ - const float *noisy_pixel = buffer + pass_noisy; + ccl_global const float *noisy_pixel = buffer + pass_noisy; denoised_pixel[3] = noisy_pixel[3]; } else { @@ -914,21 +879,22 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shadow_catcher_count_possible_splits(int num_states, - uint *num_possible_splits) + ccl_gpu_kernel_signature(integrator_shadow_catcher_count_possible_splits, + int num_states, + ccl_global uint *num_possible_splits) { const int state = ccl_gpu_global_id_x(); bool can_split = false; if (state < num_states) { - can_split = kernel_shadow_catcher_path_can_split(nullptr, state); + can_split = ccl_gpu_kernel_call(kernel_shadow_catcher_path_can_split(nullptr, state)); } /* NOTE: All threads specified in the mask must execute the intrinsic. */ - const uint can_split_mask = ccl_gpu_ballot(can_split); + const auto can_split_mask = ccl_gpu_ballot(can_split); const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size; if (lane_id == 0) { - atomic_fetch_and_add_uint32(num_possible_splits, __popc(can_split_mask)); + atomic_fetch_and_add_uint32(num_possible_splits, popcount(can_split_mask)); } } diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h index d7416beb783..a5320edcb3c 100644 --- a/intern/cycles/kernel/device/gpu/parallel_active_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -31,10 +31,43 @@ CCL_NAMESPACE_BEGIN # define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 #endif +#ifdef __KERNEL_METAL__ +struct ActiveIndexContext { + ActiveIndexContext(int _thread_index, + int _global_index, + int _threadgroup_size, + int _simdgroup_size, + int _simd_lane_index, + int _simd_group_index, + int _num_simd_groups, + threadgroup int *_simdgroup_offset) + : thread_index(_thread_index), + global_index(_global_index), + blocksize(_threadgroup_size), + ccl_gpu_warp_size(_simdgroup_size), + thread_warp(_simd_lane_index), + warp_index(_simd_group_index), + num_warps(_num_simd_groups), + warp_offset(_simdgroup_offset) + { + } + + const int thread_index, global_index, blocksize, ccl_gpu_warp_size, thread_warp, warp_index, + num_warps; + threadgroup int *warp_offset; + + template<uint blocksizeDummy, typename IsActiveOp> + void active_index_array(const uint num_states, + ccl_global int *indices, + ccl_global int *num_indices, + IsActiveOp is_active_op) + { + const uint state_index = global_index; +#else template<uint blocksize, typename IsActiveOp> __device__ void gpu_parallel_active_index_array(const uint num_states, - int *indices, - int *num_indices, + ccl_global int *indices, + ccl_global int *num_indices, IsActiveOp is_active_op) { extern ccl_gpu_shared int warp_offset[]; @@ -45,43 +78,62 @@ __device__ void gpu_parallel_active_index_array(const uint num_states, const uint warp_index = thread_index / ccl_gpu_warp_size; const uint num_warps = blocksize / ccl_gpu_warp_size; - /* Test if state corresponding to this thread is active. */ const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index; - const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0; +#endif - /* For each thread within a warp compute how many other active states precede it. */ - const uint thread_mask = 0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp); - const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & thread_mask); + /* Test if state corresponding to this thread is active. */ + const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0; - /* Last thread in warp stores number of active states for each warp. */ - if (thread_warp == ccl_gpu_warp_size - 1) { - warp_offset[warp_index] = thread_offset + is_active; - } + /* For each thread within a warp compute how many other active states precede it. */ + const uint thread_offset = popcount(ccl_gpu_ballot(is_active) & + ccl_gpu_thread_mask(thread_warp)); - ccl_gpu_syncthreads(); - - /* Last thread in block converts per-warp sizes to offsets, increments global size of - * index array and gets offset to write to. */ - if (thread_index == blocksize - 1) { - /* TODO: parallelize this. */ - int offset = 0; - for (int i = 0; i < num_warps; i++) { - int num_active = warp_offset[i]; - warp_offset[i] = offset; - offset += num_active; + /* Last thread in warp stores number of active states for each warp. */ + if (thread_warp == ccl_gpu_warp_size - 1) { + warp_offset[warp_index] = thread_offset + is_active; } - const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active; - warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active); - } + ccl_gpu_syncthreads(); + + /* Last thread in block converts per-warp sizes to offsets, increments global size of + * index array and gets offset to write to. */ + if (thread_index == blocksize - 1) { + /* TODO: parallelize this. */ + int offset = 0; + for (int i = 0; i < num_warps; i++) { + int num_active = warp_offset[i]; + warp_offset[i] = offset; + offset += num_active; + } + + const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active; + warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active); + } - ccl_gpu_syncthreads(); + ccl_gpu_syncthreads(); - /* Write to index array. */ - if (is_active) { - const uint block_offset = warp_offset[num_warps]; - indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index; + /* Write to index array. */ + if (is_active) { + const uint block_offset = warp_offset[num_warps]; + indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index; + } } -} + +#ifdef __KERNEL_METAL__ +}; /* end class ActiveIndexContext */ + +/* inject the required thread params into a struct, and redirect to its templated member function + */ +# define gpu_parallel_active_index_array \ + ActiveIndexContext(metal_local_id, \ + metal_global_id, \ + metal_local_size, \ + simdgroup_size, \ + simd_lane_index, \ + simd_group_index, \ + num_simd_groups, \ + simdgroup_offset) \ + .active_index_array +#endif CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h index 6de3a022569..4bd002c27e4 100644 --- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h +++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h @@ -33,10 +33,12 @@ CCL_NAMESPACE_BEGIN # define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512 #endif -template<uint blocksize> -__device__ void gpu_parallel_prefix_sum(int *counter, int *prefix_sum, const int num_values) +__device__ void gpu_parallel_prefix_sum(const int global_id, + ccl_global int *counter, + ccl_global int *prefix_sum, + const int num_values) { - if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) { + if (global_id != 0) { return; } diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h index c06d7be444f..c092e2a21ee 100644 --- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h @@ -33,16 +33,16 @@ CCL_NAMESPACE_BEGIN #endif #define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0) -template<uint blocksize, typename GetKeyOp> -__device__ void gpu_parallel_sorted_index_array(const uint num_states, +template<typename GetKeyOp> +__device__ void gpu_parallel_sorted_index_array(const uint state_index, + const uint num_states, const int num_states_limit, - int *indices, - int *num_indices, - int *key_counter, - int *key_prefix_sum, + ccl_global int *indices, + ccl_global int *num_indices, + ccl_global int *key_counter, + ccl_global int *key_prefix_sum, GetKeyOp get_key_op) { - const uint state_index = ccl_gpu_block_idx_x * blocksize + ccl_gpu_thread_idx_x; const int key = (state_index < num_states) ? get_key_op(state_index) : GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY; diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h index 282c3eca641..fff7a09e884 100644 --- a/intern/cycles/kernel/device/hip/compat.h +++ b/intern/cycles/kernel/device/hip/compat.h @@ -45,8 +45,9 @@ typedef unsigned long long uint64_t; #define ccl_device_forceinline __device__ __forceinline__ #define ccl_device_noinline __device__ __noinline__ #define ccl_device_noinline_cpu ccl_device +#define ccl_device_inline_method ccl_device #define ccl_global -#define ccl_static_constant __constant__ +#define ccl_inline_constant __constant__ #define ccl_device_constant __constant__ __device__ #define ccl_constant const #define ccl_gpu_shared __shared__ @@ -74,6 +75,7 @@ typedef unsigned long long uint64_t; #define ccl_gpu_block_idx_x (blockIdx.x) #define ccl_gpu_grid_dim_x (gridDim.x) #define ccl_gpu_warp_size (warpSize) +#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x) #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x) @@ -83,7 +85,6 @@ typedef unsigned long long uint64_t; #define ccl_gpu_syncthreads() __syncthreads() #define ccl_gpu_ballot(predicate) __ballot(predicate) #define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down(var, detla) -#define ccl_gpu_popc(x) __popc(x) /* GPU texture objects */ typedef hipTextureObject_t ccl_gpu_tex_object; diff --git a/intern/cycles/kernel/device/hip/config.h b/intern/cycles/kernel/device/hip/config.h index 2fde0d46015..7ec744d8ad2 100644 --- a/intern/cycles/kernel/device/hip/config.h +++ b/intern/cycles/kernel/device/hip/config.h @@ -35,12 +35,29 @@ /* Compute number of threads per block and minimum blocks per multiprocessor * given the maximum number of registers per thread. */ - #define ccl_gpu_kernel(block_num_threads, thread_num_registers) \ extern "C" __global__ void __launch_bounds__(block_num_threads, \ GPU_MULTIPRESSOR_MAX_REGISTERS / \ (block_num_threads * thread_num_registers)) +#define ccl_gpu_kernel_threads(block_num_threads) \ + extern "C" __global__ void __launch_bounds__(block_num_threads) + +#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__) + +#define ccl_gpu_kernel_call(x) x + +/* Define a function object where "func" is the lambda body, and additional parameters are used to + * specify captured state */ +#define ccl_gpu_kernel_lambda(func, ...) \ + struct KernelLambda { \ + __VA_ARGS__; \ + __device__ int operator()(const int state) \ + { \ + return (func); \ + } \ + } ccl_gpu_kernel_lambda_pass + /* sanity checks */ #if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h index 77cea30914c..61597a4acfc 100644 --- a/intern/cycles/kernel/device/metal/compat.h +++ b/intern/cycles/kernel/device/metal/compat.h @@ -34,6 +34,7 @@ using namespace metal; #pragma clang diagnostic ignored "-Wunused-variable" #pragma clang diagnostic ignored "-Wsign-compare" +#pragma clang diagnostic ignored "-Wuninitialized" /* Qualifiers */ @@ -42,8 +43,9 @@ using namespace metal; #define ccl_device_forceinline ccl_device #define ccl_device_noinline ccl_device __attribute__((noinline)) #define ccl_device_noinline_cpu ccl_device +#define ccl_device_inline_method ccl_device #define ccl_global device -#define ccl_static_constant static constant constexpr +#define ccl_inline_constant static constant constexpr #define ccl_device_constant constant #define ccl_constant const device #define ccl_gpu_shared threadgroup @@ -58,6 +60,122 @@ using namespace metal; #define kernel_assert(cond) +#define ccl_gpu_global_id_x() metal_global_id +#define ccl_gpu_warp_size simdgroup_size +#define ccl_gpu_thread_idx_x simd_group_index +#define ccl_gpu_thread_mask(thread_warp) uint64_t((1ull << thread_warp) - 1) + +#define ccl_gpu_ballot(predicate) ((uint64_t)((simd_vote::vote_t)simd_ballot(predicate))) +#define ccl_gpu_syncthreads() threadgroup_barrier(mem_flags::mem_threadgroup); + +// clang-format off + +/* kernel.h adapters */ + +#define ccl_gpu_kernel(block_num_threads, thread_num_registers) +#define ccl_gpu_kernel_threads(block_num_threads) + +/* Convert a comma-separated list into a semicolon-separated list + * (so that we can generate a struct based on kernel entry-point parameters). */ +#define FN0() +#define FN1(p1) p1; +#define FN2(p1, p2) p1; p2; +#define FN3(p1, p2, p3) p1; p2; p3; +#define FN4(p1, p2, p3, p4) p1; p2; p3; p4; +#define FN5(p1, p2, p3, p4, p5) p1; p2; p3; p4; p5; +#define FN6(p1, p2, p3, p4, p5, p6) p1; p2; p3; p4; p5; p6; +#define FN7(p1, p2, p3, p4, p5, p6, p7) p1; p2; p3; p4; p5; p6; p7; +#define FN8(p1, p2, p3, p4, p5, p6, p7, p8) p1; p2; p3; p4; p5; p6; p7; p8; +#define FN9(p1, p2, p3, p4, p5, p6, p7, p8, p9) p1; p2; p3; p4; p5; p6; p7; p8; p9; +#define FN10(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; +#define FN11(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; +#define FN12(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; +#define FN13(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; +#define FN14(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; +#define FN15(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; +#define FN16(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; +#define GET_LAST_ARG(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, ...) p16 +#define PARAMS_MAKER(...) GET_LAST_ARG(__VA_ARGS__, FN16, FN15, FN14, FN13, FN12, FN11, FN10, FN9, FN8, FN7, FN6, FN5, FN4, FN3, FN2, FN1, FN0) + +/* Generate a struct containing the entry-point parameters and a "run" + * method which can access them implicitly via this-> */ +#define ccl_gpu_kernel_signature(name, ...) \ +struct kernel_gpu_##name \ +{ \ + PARAMS_MAKER(__VA_ARGS__)(__VA_ARGS__) \ + void run(thread MetalKernelContext& context, \ + threadgroup int *simdgroup_offset, \ + const uint metal_global_id, \ + const ushort metal_local_id, \ + const ushort metal_local_size, \ + uint simdgroup_size, \ + uint simd_lane_index, \ + uint simd_group_index, \ + uint num_simd_groups) ccl_global const; \ +}; \ +kernel void kernel_metal_##name(device const kernel_gpu_##name *params_struct, \ + constant KernelParamsMetal &ccl_restrict _launch_params_metal, \ + constant MetalAncillaries *_metal_ancillaries, \ + threadgroup int *simdgroup_offset[[ threadgroup(0) ]], \ + const uint metal_global_id [[thread_position_in_grid]], \ + const ushort metal_local_id [[thread_position_in_threadgroup]], \ + const ushort metal_local_size [[threads_per_threadgroup]], \ + uint simdgroup_size [[threads_per_simdgroup]], \ + uint simd_lane_index [[thread_index_in_simdgroup]], \ + uint simd_group_index [[simdgroup_index_in_threadgroup]], \ + uint num_simd_groups [[simdgroups_per_threadgroup]]) { \ + MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \ + params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \ +} \ +void kernel_gpu_##name::run(thread MetalKernelContext& context, \ + threadgroup int *simdgroup_offset, \ + const uint metal_global_id, \ + const ushort metal_local_id, \ + const ushort metal_local_size, \ + uint simdgroup_size, \ + uint simd_lane_index, \ + uint simd_group_index, \ + uint num_simd_groups) ccl_global const + +#define ccl_gpu_kernel_call(x) context.x + +/* define a function object where "func" is the lambda body, and additional parameters are used to specify captured state */ +#define ccl_gpu_kernel_lambda(func, ...) \ + struct KernelLambda \ + { \ + KernelLambda(ccl_private MetalKernelContext &_context) : context(_context) {} \ + ccl_private MetalKernelContext &context; \ + __VA_ARGS__; \ + int operator()(const int state) const { return (func); } \ + } ccl_gpu_kernel_lambda_pass(context) + +// clang-format on + +/* volumetric lambda functions - use function objects for lambda-like functionality */ +#define VOLUME_READ_LAMBDA(function_call) \ + struct FnObjectRead { \ + KernelGlobals kg; \ + ccl_private MetalKernelContext *context; \ + int state; \ +\ + VolumeStack operator()(const int i) const \ + { \ + return context->function_call; \ + } \ + } volume_read_lambda_pass{kg, this, state}; + +#define VOLUME_WRITE_LAMBDA(function_call) \ + struct FnObjectWrite { \ + KernelGlobals kg; \ + ccl_private MetalKernelContext *context; \ + int state; \ +\ + void operator()(const int i, VolumeStack entry) const \ + { \ + context->function_call; \ + } \ + } volume_write_lambda_pass{kg, this, state}; + /* make_type definitions with Metal style element initializers */ #ifdef make_float2 # undef make_float2 @@ -112,6 +230,7 @@ using namespace metal; #define sinhf(x) sinh(float(x)) #define coshf(x) cosh(float(x)) #define tanhf(x) tanh(float(x)) +#define saturatef(x) saturate(float(x)) /* Use native functions with possibly lower precision for performance, * no issues found so far. */ @@ -124,3 +243,43 @@ using namespace metal; #define logf(x) trigmode::log(float(x)) #define NULL 0 + +#define __device__ + +/* texture bindings and sampler setup */ + +struct Texture2DParamsMetal { + texture2d<float, access::sample> tex; +}; +struct Texture3DParamsMetal { + texture3d<float, access::sample> tex; +}; + +struct MetalAncillaries { + device Texture2DParamsMetal *textures_2d; + device Texture3DParamsMetal *textures_3d; +}; + +#include "util/half.h" +#include "util/types.h" + +enum SamplerType { + SamplerFilterNearest_AddressRepeat, + SamplerFilterNearest_AddressClampEdge, + SamplerFilterNearest_AddressClampZero, + + SamplerFilterLinear_AddressRepeat, + SamplerFilterLinear_AddressClampEdge, + SamplerFilterLinear_AddressClampZero, + + SamplerCount +}; + +constant constexpr array<sampler, SamplerCount> metal_samplers = { + sampler(address::repeat, filter::nearest), + sampler(address::clamp_to_edge, filter::nearest), + sampler(address::clamp_to_zero, filter::nearest), + sampler(address::repeat, filter::linear), + sampler(address::clamp_to_edge, filter::linear), + sampler(address::clamp_to_zero, filter::linear), +}; diff --git a/intern/cycles/kernel/device/metal/context_begin.h b/intern/cycles/kernel/device/metal/context_begin.h new file mode 100644 index 00000000000..8c9e1c54077 --- /dev/null +++ b/intern/cycles/kernel/device/metal/context_begin.h @@ -0,0 +1,79 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// clang-format off + +/* Open the Metal kernel context class + * Necessary to access resource bindings */ +class MetalKernelContext { + public: + constant KernelParamsMetal &launch_params_metal; + constant MetalAncillaries *metal_ancillaries; + + MetalKernelContext(constant KernelParamsMetal &_launch_params_metal, constant MetalAncillaries * _metal_ancillaries) + : launch_params_metal(_launch_params_metal), metal_ancillaries(_metal_ancillaries) + {} + + /* texture fetch adapter functions */ + typedef uint64_t ccl_gpu_tex_object; + + template<typename T> + inline __attribute__((__always_inline__)) + T ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const { + kernel_assert(0); + return 0; + } + template<typename T> + inline __attribute__((__always_inline__)) + T ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const { + kernel_assert(0); + return 0; + } + + // texture2d + template<> + inline __attribute__((__always_inline__)) + float4 ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const { + const uint tid(tex); + const uint sid(tex >> 32); + return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y)); + } + template<> + inline __attribute__((__always_inline__)) + float ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const { + const uint tid(tex); + const uint sid(tex >> 32); + return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y)).x; + } + + // texture3d + template<> + inline __attribute__((__always_inline__)) + float4 ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const { + const uint tid(tex); + const uint sid(tex >> 32); + return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z)); + } + template<> + inline __attribute__((__always_inline__)) + float ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const { + const uint tid(tex); + const uint sid(tex >> 32); + return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z)).x; + } +# include "kernel/device/gpu/image.h" + + // clang-format on diff --git a/intern/cycles/kernel/device/metal/context_end.h b/intern/cycles/kernel/device/metal/context_end.h new file mode 100644 index 00000000000..e700f294440 --- /dev/null +++ b/intern/cycles/kernel/device/metal/context_end.h @@ -0,0 +1,23 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +} +; /* end of MetalKernelContext class definition */ + +/* Silently redirect into the MetalKernelContext instance */ +/* NOTE: These macros will need maintaining as entry-points change. */ + +#undef kernel_integrator_state +#define kernel_integrator_state context.launch_params_metal.__integrator_state diff --git a/intern/cycles/kernel/device/metal/globals.h b/intern/cycles/kernel/device/metal/globals.h new file mode 100644 index 00000000000..1aea36589d0 --- /dev/null +++ b/intern/cycles/kernel/device/metal/globals.h @@ -0,0 +1,51 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Constant Globals */ + +#include "kernel/types.h" +#include "kernel/util/profiling.h" + +#include "kernel/integrator/state.h" + +CCL_NAMESPACE_BEGIN + +typedef struct KernelParamsMetal { + +#define KERNEL_TEX(type, name) ccl_global const type *name; +#include "kernel/textures.h" +#undef KERNEL_TEX + + const IntegratorStateGPU __integrator_state; + const KernelData data; + +} KernelParamsMetal; + +typedef struct KernelGlobalsGPU { + int unused[1]; +} KernelGlobalsGPU; + +typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals; + +#define kernel_data launch_params_metal.data +#define kernel_integrator_state launch_params_metal.__integrator_state + +/* data lookup defines */ + +#define kernel_tex_fetch(tex, index) launch_params_metal.tex[index] +#define kernel_tex_array(tex) launch_params_metal.tex + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/metal/kernel.metal b/intern/cycles/kernel/device/metal/kernel.metal new file mode 100644 index 00000000000..feca20ff475 --- /dev/null +++ b/intern/cycles/kernel/device/metal/kernel.metal @@ -0,0 +1,25 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Metal kernel entry points */ + +// clang-format off + +#include "kernel/device/metal/compat.h" +#include "kernel/device/metal/globals.h" +#include "kernel/device/gpu/kernel.h" + +// clang-format on
\ No newline at end of file diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h index 835e4621d47..0619c135c39 100644 --- a/intern/cycles/kernel/device/optix/compat.h +++ b/intern/cycles/kernel/device/optix/compat.h @@ -49,10 +49,11 @@ typedef unsigned long long uint64_t; __device__ __forceinline__ // Function calls are bad for OptiX performance, so inline everything #define ccl_device_inline ccl_device #define ccl_device_forceinline ccl_device +#define ccl_device_inline_method ccl_device #define ccl_device_noinline __device__ __noinline__ #define ccl_device_noinline_cpu ccl_device #define ccl_global -#define ccl_static_constant __constant__ +#define ccl_inline_constant __constant__ #define ccl_device_constant __constant__ __device__ #define ccl_constant const #define ccl_gpu_shared __shared__ @@ -76,6 +77,7 @@ typedef unsigned long long uint64_t; #define ccl_gpu_block_idx_x (blockIdx.x) #define ccl_gpu_grid_dim_x (gridDim.x) #define ccl_gpu_warp_size (warpSize) +#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x) #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x) @@ -85,7 +87,6 @@ typedef unsigned long long uint64_t; #define ccl_gpu_syncthreads() __syncthreads() #define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate) #define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla) -#define ccl_gpu_popc(x) __popc(x) /* GPU texture objects */ diff --git a/intern/cycles/kernel/device/optix/kernel.cu b/intern/cycles/kernel/device/optix/kernel.cu index b987aa7a817..70b977b3d84 100644 --- a/intern/cycles/kernel/device/optix/kernel.cu +++ b/intern/cycles/kernel/device/optix/kernel.cu @@ -21,6 +21,8 @@ #include "kernel/device/gpu/image.h" /* Texture lookup uses normal CUDA intrinsics. */ +#include "kernel/tables.h" + #include "kernel/integrator/state.h" #include "kernel/integrator/state_flow.h" #include "kernel/integrator/state_util.h" @@ -29,9 +31,11 @@ #include "kernel/integrator/intersect_shadow.h" #include "kernel/integrator/intersect_subsurface.h" #include "kernel/integrator/intersect_volume_stack.h" - // clang-format on +#define OPTIX_DEFINE_ABI_VERSION_ONLY +#include <optix_function_table.h> + template<typename T> ccl_device_forceinline T *get_payload_ptr_0() { return pointer_unpack_from_uint<T>(optixGetPayload_0(), optixGetPayload_1()); @@ -44,7 +48,7 @@ template<typename T> ccl_device_forceinline T *get_payload_ptr_2() ccl_device_forceinline int get_object_id() { #ifdef __OBJECT_MOTION__ - /* Always get the the instance ID from the TLAS + /* Always get the instance ID from the TLAS * There might be a motion transform node between TLAS and BLAS which does not have one. */ return optixGetInstanceIdFromHandle(optixGetTransformListHandle(0)); #else @@ -159,9 +163,9 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit() /* Record geometric normal. */ const uint tri_vindex = kernel_tex_fetch(__tri_vindex, prim).w; - const float3 tri_a = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 0)); - const float3 tri_b = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 1)); - const float3 tri_c = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 2)); + const float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0); + const float3 tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1); + const float3 tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2); local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a)); /* Continue tracing (without this the trace call would return after the first hit). */ @@ -198,10 +202,12 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit() type = segment.type; prim = segment.prim; +# if OPTIX_ABI_VERSION < 55 /* Filter out curve endcaps. */ if (u == 0.0f || u == 1.0f) { return optixIgnoreIntersection(); } +# endif } # endif @@ -308,6 +314,7 @@ extern "C" __global__ void __anyhit__kernel_optix_volume_test() extern "C" __global__ void __anyhit__kernel_optix_visibility_test() { #ifdef __HAIR__ +# if OPTIX_ABI_VERSION < 55 if (!optixIsTriangleHit()) { /* Filter out curve endcaps. */ const float u = __uint_as_float(optixGetAttribute_0()); @@ -315,6 +322,7 @@ extern "C" __global__ void __anyhit__kernel_optix_visibility_test() return optixIgnoreIntersection(); } } +# endif #endif #ifdef __VISIBILITY_FLAG__ diff --git a/intern/cycles/kernel/film/accumulate.h b/intern/cycles/kernel/film/accumulate.h index d66d7d6fb70..fb52b1cd05f 100644 --- a/intern/cycles/kernel/film/accumulate.h +++ b/intern/cycles/kernel/film/accumulate.h @@ -151,7 +151,8 @@ ccl_device_forceinline ccl_global float *kernel_accum_pixel_render_buffer( ccl_device_inline int kernel_accum_sample(KernelGlobals kg, ConstIntegratorState state, ccl_global float *ccl_restrict render_buffer, - int sample) + int sample, + int sample_offset) { if (kernel_data.film.pass_sample_count == PASS_UNUSED) { return sample; @@ -159,7 +160,9 @@ ccl_device_inline int kernel_accum_sample(KernelGlobals kg, ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer); - return atomic_fetch_and_add_uint32((uint *)(buffer) + kernel_data.film.pass_sample_count, 1); + return atomic_fetch_and_add_uint32( + (ccl_global uint *)(buffer) + kernel_data.film.pass_sample_count, 1) + + sample_offset; } ccl_device void kernel_accum_adaptive_buffer(KernelGlobals kg, @@ -550,7 +553,7 @@ ccl_device_inline void kernel_accum_background(KernelGlobals kg, const bool is_transparent_background_ray, ccl_global float *ccl_restrict render_buffer) { - float3 contribution = INTEGRATOR_STATE(state, path, throughput) * L; + float3 contribution = float3(INTEGRATOR_STATE(state, path, throughput)) * L; kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(state, path, bounce) - 1); ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer); diff --git a/intern/cycles/kernel/geom/attribute.h b/intern/cycles/kernel/geom/attribute.h index 848e0430caa..a7ac2bd926f 100644 --- a/intern/cycles/kernel/geom/attribute.h +++ b/intern/cycles/kernel/geom/attribute.h @@ -27,7 +27,12 @@ CCL_NAMESPACE_BEGIN * Lookup of attributes is different between OSL and SVM, as OSL is ustring * based while for SVM we use integer ids. */ -ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd); +/* Patch index for triangle, -1 if not subdivision triangle */ + +ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd) +{ + return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0; +} ccl_device_inline uint attribute_primitive_type(KernelGlobals kg, ccl_private const ShaderData *sd) { @@ -106,9 +111,9 @@ ccl_device Transform primitive_attribute_matrix(KernelGlobals kg, { Transform tfm; - tfm.x = kernel_tex_fetch(__attributes_float3, desc.offset + 0); - tfm.y = kernel_tex_fetch(__attributes_float3, desc.offset + 1); - tfm.z = kernel_tex_fetch(__attributes_float3, desc.offset + 2); + tfm.x = kernel_tex_fetch(__attributes_float4, desc.offset + 0); + tfm.y = kernel_tex_fetch(__attributes_float4, desc.offset + 1); + tfm.z = kernel_tex_fetch(__attributes_float4, desc.offset + 2); return tfm; } diff --git a/intern/cycles/kernel/geom/curve.h b/intern/cycles/kernel/geom/curve.h index 7271193eef8..4b6eecf9640 100644 --- a/intern/cycles/kernel/geom/curve.h +++ b/intern/cycles/kernel/geom/curve.h @@ -126,8 +126,8 @@ ccl_device float3 curve_attribute_float3(KernelGlobals kg, int k0 = curve.first_key + PRIMITIVE_UNPACK_SEGMENT(sd->type); int k1 = k0 + 1; - float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k0)); - float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k1)); + float3 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + k0); + float3 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + k1); # ifdef __RAY_DIFFERENTIALS__ if (dx) @@ -149,7 +149,7 @@ ccl_device float3 curve_attribute_float3(KernelGlobals kg, if (desc.element & (ATTR_ELEMENT_CURVE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) { const int offset = (desc.element == ATTR_ELEMENT_CURVE) ? desc.offset + sd->prim : desc.offset; - return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset)); + return kernel_tex_fetch(__attributes_float3, offset); } else { return make_float3(0.0f, 0.0f, 0.0f); @@ -168,8 +168,8 @@ ccl_device float4 curve_attribute_float4(KernelGlobals kg, int k0 = curve.first_key + PRIMITIVE_UNPACK_SEGMENT(sd->type); int k1 = k0 + 1; - float4 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + k0); - float4 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + k1); + float4 f0 = kernel_tex_fetch(__attributes_float4, desc.offset + k0); + float4 f1 = kernel_tex_fetch(__attributes_float4, desc.offset + k1); # ifdef __RAY_DIFFERENTIALS__ if (dx) @@ -191,7 +191,7 @@ ccl_device float4 curve_attribute_float4(KernelGlobals kg, if (desc.element & (ATTR_ELEMENT_CURVE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) { const int offset = (desc.element == ATTR_ELEMENT_CURVE) ? desc.offset + sd->prim : desc.offset; - return kernel_tex_fetch(__attributes_float3, offset); + return kernel_tex_fetch(__attributes_float4, offset); } else { return make_float4(0.0f, 0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/kernel/geom/motion_curve.h b/intern/cycles/kernel/geom/motion_curve.h index 2dd213d43f6..8358c94360f 100644 --- a/intern/cycles/kernel/geom/motion_curve.h +++ b/intern/cycles/kernel/geom/motion_curve.h @@ -48,8 +48,8 @@ ccl_device_inline void motion_curve_keys_for_step_linear(KernelGlobals kg, offset += step * numkeys; - keys[0] = kernel_tex_fetch(__attributes_float3, offset + k0); - keys[1] = kernel_tex_fetch(__attributes_float3, offset + k1); + keys[0] = kernel_tex_fetch(__attributes_float4, offset + k0); + keys[1] = kernel_tex_fetch(__attributes_float4, offset + k1); } } @@ -106,10 +106,10 @@ ccl_device_inline void motion_curve_keys_for_step(KernelGlobals kg, offset += step * numkeys; - keys[0] = kernel_tex_fetch(__attributes_float3, offset + k0); - keys[1] = kernel_tex_fetch(__attributes_float3, offset + k1); - keys[2] = kernel_tex_fetch(__attributes_float3, offset + k2); - keys[3] = kernel_tex_fetch(__attributes_float3, offset + k3); + keys[0] = kernel_tex_fetch(__attributes_float4, offset + k0); + keys[1] = kernel_tex_fetch(__attributes_float4, offset + k1); + keys[2] = kernel_tex_fetch(__attributes_float4, offset + k2); + keys[3] = kernel_tex_fetch(__attributes_float4, offset + k3); } } diff --git a/intern/cycles/kernel/geom/motion_triangle.h b/intern/cycles/kernel/geom/motion_triangle.h index 43f894938e0..62b7b630c89 100644 --- a/intern/cycles/kernel/geom/motion_triangle.h +++ b/intern/cycles/kernel/geom/motion_triangle.h @@ -43,9 +43,9 @@ ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals kg, { if (step == numsteps) { /* center step: regular vertex location */ - verts[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0)); - verts[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1)); - verts[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2)); + verts[0] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0); + verts[1] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1); + verts[2] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2); } else { /* center step not store in this array */ @@ -54,9 +54,9 @@ ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals kg, offset += step * numverts; - verts[0] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x)); - verts[1] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y)); - verts[2] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z)); + verts[0] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x); + verts[1] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y); + verts[2] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z); } } @@ -70,9 +70,9 @@ ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals kg, { if (step == numsteps) { /* center step: regular vertex location */ - normals[0] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x)); - normals[1] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y)); - normals[2] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z)); + normals[0] = kernel_tex_fetch(__tri_vnormal, tri_vindex.x); + normals[1] = kernel_tex_fetch(__tri_vnormal, tri_vindex.y); + normals[2] = kernel_tex_fetch(__tri_vnormal, tri_vindex.z); } else { /* center step is not stored in this array */ @@ -81,9 +81,9 @@ ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals kg, offset += step * numverts; - normals[0] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x)); - normals[1] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y)); - normals[2] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z)); + normals[0] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x); + normals[1] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y); + normals[2] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z); } } diff --git a/intern/cycles/kernel/geom/motion_triangle_intersect.h b/intern/cycles/kernel/geom/motion_triangle_intersect.h index 256e7add21e..72ad237eeeb 100644 --- a/intern/cycles/kernel/geom/motion_triangle_intersect.h +++ b/intern/cycles/kernel/geom/motion_triangle_intersect.h @@ -163,19 +163,7 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals kg, motion_triangle_vertices(kg, fobject, prim, time, verts); /* Ray-triangle intersection, unoptimized. */ float t, u, v; - if (ray_triangle_intersect(P, - dir, - tmax, -#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) - (ssef *)verts, -#else - verts[0], - verts[1], - verts[2], -#endif - &u, - &v, - &t)) { + if (ray_triangle_intersect(P, dir, tmax, verts[0], verts[1], verts[2], &u, &v, &t)) { #ifdef __VISIBILITY_FLAG__ /* Visibility flag test. we do it here under the assumption * that most triangles are culled by node flags. @@ -229,19 +217,7 @@ ccl_device_inline bool motion_triangle_intersect_local(KernelGlobals kg, motion_triangle_vertices(kg, local_object, prim, time, verts); /* Ray-triangle intersection, unoptimized. */ float t, u, v; - if (!ray_triangle_intersect(P, - dir, - tmax, -# if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) - (ssef *)verts, -# else - verts[0], - verts[1], - verts[2], -# endif - &u, - &v, - &t)) { + if (!ray_triangle_intersect(P, dir, tmax, verts[0], verts[1], verts[2], &u, &v, &t)) { return false; } diff --git a/intern/cycles/kernel/geom/patch.h b/intern/cycles/kernel/geom/patch.h index 7d24937a41e..432618aa243 100644 --- a/intern/cycles/kernel/geom/patch.h +++ b/intern/cycles/kernel/geom/patch.h @@ -380,7 +380,7 @@ ccl_device float3 patch_eval_float3(KernelGlobals kg, *dv = make_float3(0.0f, 0.0f, 0.0f); for (int i = 0; i < num_control; i++) { - float3 v = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + indices[i])); + float3 v = kernel_tex_fetch(__attributes_float3, offset + indices[i]); val += v * weights[i]; if (du) @@ -417,7 +417,7 @@ ccl_device float4 patch_eval_float4(KernelGlobals kg, *dv = make_float4(0.0f, 0.0f, 0.0f, 0.0f); for (int i = 0; i < num_control; i++) { - float4 v = kernel_tex_fetch(__attributes_float3, offset + indices[i]); + float4 v = kernel_tex_fetch(__attributes_float4, offset + indices[i]); val += v * weights[i]; if (du) diff --git a/intern/cycles/kernel/geom/primitive.h b/intern/cycles/kernel/geom/primitive.h index 7a8921b6d6e..6d7b550d82f 100644 --- a/intern/cycles/kernel/geom/primitive.h +++ b/intern/cycles/kernel/geom/primitive.h @@ -284,18 +284,33 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals kg, int numverts, numkeys; object_motion_info(kg, sd->object, NULL, &numverts, &numkeys); - /* lookup attributes */ - motion_pre = primitive_surface_attribute_float3(kg, sd, desc, NULL, NULL); - - desc.offset += (sd->type & PRIMITIVE_ALL_TRIANGLE) ? numverts : numkeys; - motion_post = primitive_surface_attribute_float3(kg, sd, desc, NULL, NULL); - #ifdef __HAIR__ - if (is_curve_primitive && (sd->object_flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) { - object_position_transform(kg, sd, &motion_pre); - object_position_transform(kg, sd, &motion_post); + if (is_curve_primitive) { + motion_pre = float4_to_float3(curve_attribute_float4(kg, sd, desc, NULL, NULL)); + desc.offset += numkeys; + motion_post = float4_to_float3(curve_attribute_float4(kg, sd, desc, NULL, NULL)); + + /* Curve */ + if ((sd->object_flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) { + object_position_transform(kg, sd, &motion_pre); + object_position_transform(kg, sd, &motion_post); + } } + else #endif + if (sd->type & PRIMITIVE_ALL_TRIANGLE) { + /* Triangle */ + if (subd_triangle_patch(kg, sd) == ~0) { + motion_pre = triangle_attribute_float3(kg, sd, desc, NULL, NULL); + desc.offset += numverts; + motion_post = triangle_attribute_float3(kg, sd, desc, NULL, NULL); + } + else { + motion_pre = subd_triangle_attribute_float3(kg, sd, desc, NULL, NULL); + desc.offset += numverts; + motion_post = subd_triangle_attribute_float3(kg, sd, desc, NULL, NULL); + } + } } /* object motion. note that depending on the mesh having motion vectors, this diff --git a/intern/cycles/kernel/geom/subd_triangle.h b/intern/cycles/kernel/geom/subd_triangle.h index 8a9a3f71231..1b693a915bf 100644 --- a/intern/cycles/kernel/geom/subd_triangle.h +++ b/intern/cycles/kernel/geom/subd_triangle.h @@ -20,13 +20,6 @@ CCL_NAMESPACE_BEGIN -/* Patch index for triangle, -1 if not subdivision triangle */ - -ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd) -{ - return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0; -} - /* UV coords of triangle within patch */ ccl_device_inline void subd_triangle_patch_uv(KernelGlobals kg, @@ -443,8 +436,8 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg, if (dy) *dy = make_float3(0.0f, 0.0f, 0.0f); - return float4_to_float3( - kernel_tex_fetch(__attributes_float3, desc.offset + subd_triangle_patch_face(kg, patch))); + return kernel_tex_fetch(__attributes_float3, + desc.offset + subd_triangle_patch_face(kg, patch)); } else if (desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) { float2 uv[3]; @@ -452,10 +445,10 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg, uint4 v = subd_triangle_patch_indices(kg, patch); - float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.x)); - float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.y)); - float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.z)); - float3 f3 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.w)); + float3 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + v.x); + float3 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + v.y); + float3 f2 = kernel_tex_fetch(__attributes_float3, desc.offset + v.z); + float3 f3 = kernel_tex_fetch(__attributes_float3, desc.offset + v.w); if (subd_triangle_patch_num_corners(kg, patch) != 4) { f1 = (f1 + f0) * 0.5f; @@ -484,10 +477,10 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg, float3 f0, f1, f2, f3; - f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[0] + desc.offset)); - f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[1] + desc.offset)); - f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[2] + desc.offset)); - f3 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[3] + desc.offset)); + f0 = kernel_tex_fetch(__attributes_float3, corners[0] + desc.offset); + f1 = kernel_tex_fetch(__attributes_float3, corners[1] + desc.offset); + f2 = kernel_tex_fetch(__attributes_float3, corners[2] + desc.offset); + f3 = kernel_tex_fetch(__attributes_float3, corners[3] + desc.offset); if (subd_triangle_patch_num_corners(kg, patch) != 4) { f1 = (f1 + f0) * 0.5f; @@ -513,7 +506,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg, if (dy) *dy = make_float3(0.0f, 0.0f, 0.0f); - return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset)); + return kernel_tex_fetch(__attributes_float3, desc.offset); } else { if (dx) @@ -590,7 +583,7 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg, if (dy) *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - return kernel_tex_fetch(__attributes_float3, + return kernel_tex_fetch(__attributes_float4, desc.offset + subd_triangle_patch_face(kg, patch)); } else if (desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) { @@ -599,10 +592,10 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg, uint4 v = subd_triangle_patch_indices(kg, patch); - float4 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + v.x); - float4 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + v.y); - float4 f2 = kernel_tex_fetch(__attributes_float3, desc.offset + v.z); - float4 f3 = kernel_tex_fetch(__attributes_float3, desc.offset + v.w); + float4 f0 = kernel_tex_fetch(__attributes_float4, desc.offset + v.x); + float4 f1 = kernel_tex_fetch(__attributes_float4, desc.offset + v.y); + float4 f2 = kernel_tex_fetch(__attributes_float4, desc.offset + v.z); + float4 f3 = kernel_tex_fetch(__attributes_float4, desc.offset + v.w); if (subd_triangle_patch_num_corners(kg, patch) != 4) { f1 = (f1 + f0) * 0.5f; @@ -642,10 +635,10 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg, color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, corners[3] + desc.offset))); } else { - f0 = kernel_tex_fetch(__attributes_float3, corners[0] + desc.offset); - f1 = kernel_tex_fetch(__attributes_float3, corners[1] + desc.offset); - f2 = kernel_tex_fetch(__attributes_float3, corners[2] + desc.offset); - f3 = kernel_tex_fetch(__attributes_float3, corners[3] + desc.offset); + f0 = kernel_tex_fetch(__attributes_float4, corners[0] + desc.offset); + f1 = kernel_tex_fetch(__attributes_float4, corners[1] + desc.offset); + f2 = kernel_tex_fetch(__attributes_float4, corners[2] + desc.offset); + f3 = kernel_tex_fetch(__attributes_float4, corners[3] + desc.offset); } if (subd_triangle_patch_num_corners(kg, patch) != 4) { @@ -672,7 +665,7 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg, if (dy) *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - return kernel_tex_fetch(__attributes_float3, desc.offset); + return kernel_tex_fetch(__attributes_float4, desc.offset); } else { if (dx) diff --git a/intern/cycles/kernel/geom/triangle.h b/intern/cycles/kernel/geom/triangle.h index 233e901c7ca..854022b3369 100644 --- a/intern/cycles/kernel/geom/triangle.h +++ b/intern/cycles/kernel/geom/triangle.h @@ -29,9 +29,9 @@ ccl_device_inline float3 triangle_normal(KernelGlobals kg, ccl_private ShaderDat { /* load triangle vertices */ const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); - const float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0)); - const float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1)); - const float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2)); + const float3 v0 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0); + const float3 v1 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1); + const float3 v2 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2); /* return normal */ if (sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) { @@ -54,9 +54,9 @@ ccl_device_inline void triangle_point_normal(KernelGlobals kg, { /* load triangle vertices */ const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); - float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0)); - float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1)); - float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2)); + float3 v0 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0); + float3 v1 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1); + float3 v2 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2); /* compute point */ float t = 1.0f - u - v; *P = (u * v0 + v * v1 + t * v2); @@ -78,9 +78,9 @@ ccl_device_inline void triangle_point_normal(KernelGlobals kg, ccl_device_inline void triangle_vertices(KernelGlobals kg, int prim, float3 P[3]) { const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); - P[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0)); - P[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1)); - P[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2)); + P[0] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0); + P[1] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1); + P[2] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2); } /* Triangle vertex locations and vertex normals */ @@ -91,12 +91,12 @@ ccl_device_inline void triangle_vertices_and_normals(KernelGlobals kg, float3 N[3]) { const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); - P[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0)); - P[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1)); - P[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2)); - N[0] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x)); - N[1] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y)); - N[2] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z)); + P[0] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0); + P[1] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1); + P[2] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2); + N[0] = kernel_tex_fetch(__tri_vnormal, tri_vindex.x); + N[1] = kernel_tex_fetch(__tri_vnormal, tri_vindex.y); + N[2] = kernel_tex_fetch(__tri_vnormal, tri_vindex.z); } /* Interpolate smooth vertex normal from vertices */ @@ -106,9 +106,9 @@ triangle_smooth_normal(KernelGlobals kg, float3 Ng, int prim, float u, float v) { /* load triangle vertices */ const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); - float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x)); - float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y)); - float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z)); + float3 n0 = kernel_tex_fetch(__tri_vnormal, tri_vindex.x); + float3 n1 = kernel_tex_fetch(__tri_vnormal, tri_vindex.y); + float3 n2 = kernel_tex_fetch(__tri_vnormal, tri_vindex.z); float3 N = safe_normalize((1.0f - u - v) * n2 + u * n0 + v * n1); @@ -120,9 +120,9 @@ ccl_device_inline float3 triangle_smooth_normal_unnormalized( { /* load triangle vertices */ const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); - float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x)); - float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y)); - float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z)); + float3 n0 = kernel_tex_fetch(__tri_vnormal, tri_vindex.x); + float3 n1 = kernel_tex_fetch(__tri_vnormal, tri_vindex.y); + float3 n2 = kernel_tex_fetch(__tri_vnormal, tri_vindex.z); /* ensure that the normals are in object space */ if (sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED) { @@ -145,9 +145,9 @@ ccl_device_inline void triangle_dPdudv(KernelGlobals kg, { /* fetch triangle vertex coordinates */ const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); - const float3 p0 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0)); - const float3 p1 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1)); - const float3 p2 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2)); + const float3 p0 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0); + const float3 p1 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1); + const float3 p2 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2); /* compute derivatives of P w.r.t. uv */ *dPdu = (p0 - p2); @@ -267,15 +267,15 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals kg, if (desc.element & (ATTR_ELEMENT_VERTEX | ATTR_ELEMENT_VERTEX_MOTION)) { const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); - f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x)); - f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y)); - f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z)); + f0 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x); + f1 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y); + f2 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z); } else { const int tri = desc.offset + sd->prim * 3; - f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 0)); - f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 1)); - f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 2)); + f0 = kernel_tex_fetch(__attributes_float3, tri + 0); + f1 = kernel_tex_fetch(__attributes_float3, tri + 1); + f2 = kernel_tex_fetch(__attributes_float3, tri + 2); } #ifdef __RAY_DIFFERENTIALS__ @@ -298,7 +298,7 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals kg, if (desc.element & (ATTR_ELEMENT_FACE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) { const int offset = (desc.element == ATTR_ELEMENT_FACE) ? desc.offset + sd->prim : desc.offset; - return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset)); + return kernel_tex_fetch(__attributes_float3, offset); } else { return make_float3(0.0f, 0.0f, 0.0f); @@ -318,16 +318,16 @@ ccl_device float4 triangle_attribute_float4(KernelGlobals kg, if (desc.element & (ATTR_ELEMENT_VERTEX | ATTR_ELEMENT_VERTEX_MOTION)) { const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); - f0 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x); - f1 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y); - f2 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z); + f0 = kernel_tex_fetch(__attributes_float4, desc.offset + tri_vindex.x); + f1 = kernel_tex_fetch(__attributes_float4, desc.offset + tri_vindex.y); + f2 = kernel_tex_fetch(__attributes_float4, desc.offset + tri_vindex.z); } else { const int tri = desc.offset + sd->prim * 3; if (desc.element == ATTR_ELEMENT_CORNER) { - f0 = kernel_tex_fetch(__attributes_float3, tri + 0); - f1 = kernel_tex_fetch(__attributes_float3, tri + 1); - f2 = kernel_tex_fetch(__attributes_float3, tri + 2); + f0 = kernel_tex_fetch(__attributes_float4, tri + 0); + f1 = kernel_tex_fetch(__attributes_float4, tri + 1); + f2 = kernel_tex_fetch(__attributes_float4, tri + 2); } else { f0 = color_srgb_to_linear_v4( @@ -359,7 +359,7 @@ ccl_device float4 triangle_attribute_float4(KernelGlobals kg, if (desc.element & (ATTR_ELEMENT_FACE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) { const int offset = (desc.element == ATTR_ELEMENT_FACE) ? desc.offset + sd->prim : desc.offset; - return kernel_tex_fetch(__attributes_float3, offset); + return kernel_tex_fetch(__attributes_float4, offset); } else { return make_float4(0.0f, 0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/kernel/geom/triangle_intersect.h b/intern/cycles/kernel/geom/triangle_intersect.h index faff8a85a93..57a6ae7fe72 100644 --- a/intern/cycles/kernel/geom/triangle_intersect.h +++ b/intern/cycles/kernel/geom/triangle_intersect.h @@ -37,27 +37,11 @@ ccl_device_inline bool triangle_intersect(KernelGlobals kg, { const int prim = kernel_tex_fetch(__prim_index, prim_addr); const uint tri_vindex = kernel_tex_fetch(__tri_vindex, prim).w; -#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) - const ssef *ssef_verts = (ssef *)&kg->__tri_verts.data[tri_vindex]; -#else - const float4 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0), + const float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0), tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1), tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2); -#endif float t, u, v; - if (ray_triangle_intersect(P, - dir, - tmax, -#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) - ssef_verts, -#else - float4_to_float3(tri_a), - float4_to_float3(tri_b), - float4_to_float3(tri_c), -#endif - &u, - &v, - &t)) { + if (ray_triangle_intersect(P, dir, tmax, tri_a, tri_b, tri_c, &u, &v, &t)) { #ifdef __VISIBILITY_FLAG__ /* Visibility flag test. we do it here under the assumption * that most triangles are culled by node flags. @@ -106,27 +90,11 @@ ccl_device_inline bool triangle_intersect_local(KernelGlobals kg, const int prim = kernel_tex_fetch(__prim_index, prim_addr); const uint tri_vindex = kernel_tex_fetch(__tri_vindex, prim).w; -# if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) - const ssef *ssef_verts = (ssef *)&kg->__tri_verts.data[tri_vindex]; -# else - const float3 tri_a = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 0)), - tri_b = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 1)), - tri_c = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 2)); -# endif + const float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0), + tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1), + tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2); float t, u, v; - if (!ray_triangle_intersect(P, - dir, - tmax, -# if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) - ssef_verts, -# else - tri_a, - tri_b, - tri_c, -# endif - &u, - &v, - &t)) { + if (!ray_triangle_intersect(P, dir, tmax, tri_a, tri_b, tri_c, &u, &v, &t)) { return false; } @@ -178,11 +146,6 @@ ccl_device_inline bool triangle_intersect_local(KernelGlobals kg, isect->t = t; /* Record geometric normal. */ -# if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) - const float3 tri_a = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 0)), - tri_b = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 1)), - tri_c = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 2)); -# endif local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a)); return false; @@ -223,9 +186,9 @@ ccl_device_inline float3 triangle_refine(KernelGlobals kg, P = P + D * t; const uint tri_vindex = kernel_tex_fetch(__tri_vindex, isect_prim).w; - const float4 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0), - tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1), - tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2); + const packed_float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0), + tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1), + tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2); float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z); float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z); float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z); @@ -280,9 +243,9 @@ ccl_device_inline float3 triangle_refine_local(KernelGlobals kg, # ifdef __INTERSECTION_REFINE__ const uint tri_vindex = kernel_tex_fetch(__tri_vindex, isect_prim).w; - const float4 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0), - tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1), - tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2); + const packed_float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0), + tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1), + tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2); float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z); float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z); float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z); diff --git a/intern/cycles/kernel/geom/volume.h b/intern/cycles/kernel/geom/volume.h index 4e83ad6acb3..387eb2646da 100644 --- a/intern/cycles/kernel/geom/volume.h +++ b/intern/cycles/kernel/geom/volume.h @@ -75,7 +75,7 @@ ccl_device float4 volume_attribute_float4(KernelGlobals kg, const AttributeDescriptor desc) { if (desc.element & (ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) { - return kernel_tex_fetch(__attributes_float3, desc.offset); + return kernel_tex_fetch(__attributes_float4, desc.offset); } else if (desc.element == ATTR_ELEMENT_VOXEL) { /* todo: optimize this so we don't have to transform both here and in diff --git a/intern/cycles/kernel/integrator/init_from_bake.h b/intern/cycles/kernel/integrator/init_from_bake.h index dbaf02836e4..f4a2fbea405 100644 --- a/intern/cycles/kernel/integrator/init_from_bake.h +++ b/intern/cycles/kernel/integrator/init_from_bake.h @@ -65,7 +65,8 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg, } /* Always count the sample, even if the camera sample will reject the ray. */ - const int sample = kernel_accum_sample(kg, state, render_buffer, scheduled_sample); + const int sample = kernel_accum_sample( + kg, state, render_buffer, scheduled_sample, tile->sample_offset); /* Setup render buffers. */ const int index = INTEGRATOR_STATE(state, path, render_pixel_index); diff --git a/intern/cycles/kernel/integrator/init_from_camera.h b/intern/cycles/kernel/integrator/init_from_camera.h index f0ba77bd9a6..59dd1a9fa75 100644 --- a/intern/cycles/kernel/integrator/init_from_camera.h +++ b/intern/cycles/kernel/integrator/init_from_camera.h @@ -89,7 +89,8 @@ ccl_device bool integrator_init_from_camera(KernelGlobals kg, * This logic allows to both count actual number of samples per pixel, and to add samples to this * pixel after it was converged and samples were added somewhere else (in which case the * `scheduled_sample` will be different from actual number of samples in this pixel). */ - const int sample = kernel_accum_sample(kg, state, render_buffer, scheduled_sample); + const int sample = kernel_accum_sample( + kg, state, render_buffer, scheduled_sample, tile->sample_offset); /* Initialize random number seed for path. */ const uint rng_hash = path_rng_hash_init(kg, sample, x, y); diff --git a/intern/cycles/kernel/integrator/shade_background.h b/intern/cycles/kernel/integrator/shade_background.h index 31452de1ca4..a8ebbe908ae 100644 --- a/intern/cycles/kernel/integrator/shade_background.h +++ b/intern/cycles/kernel/integrator/shade_background.h @@ -20,7 +20,6 @@ #include "kernel/integrator/shader_eval.h" #include "kernel/light/light.h" #include "kernel/light/sample.h" -#include "kernel/sample/mis.h" CCL_NAMESPACE_BEGIN @@ -81,8 +80,7 @@ ccl_device float3 integrator_eval_background_shader(KernelGlobals kg, /* multiple importance sampling, get background light pdf for ray * direction, and compute weight with respect to BSDF pdf */ const float pdf = background_light_pdf(kg, ray_P - ray_D * mis_ray_t, ray_D); - const float mis_weight = power_heuristic(mis_ray_pdf, pdf); - + const float mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, pdf); L *= mis_weight; } # endif @@ -169,7 +167,7 @@ ccl_device_inline void integrate_distant_lights(KernelGlobals kg, /* multiple importance sampling, get regular light pdf, * and compute weight with respect to BSDF pdf */ const float mis_ray_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf); - const float mis_weight = power_heuristic(mis_ray_pdf, ls.pdf); + const float mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, ls.pdf); light_eval *= mis_weight; } diff --git a/intern/cycles/kernel/integrator/shade_light.h b/intern/cycles/kernel/integrator/shade_light.h index 5abe9e98abc..97ca430752c 100644 --- a/intern/cycles/kernel/integrator/shade_light.h +++ b/intern/cycles/kernel/integrator/shade_light.h @@ -84,7 +84,7 @@ ccl_device_inline void integrate_light(KernelGlobals kg, /* multiple importance sampling, get regular light pdf, * and compute weight with respect to BSDF pdf */ const float mis_ray_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf); - const float mis_weight = power_heuristic(mis_ray_pdf, ls.pdf); + const float mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, ls.pdf); light_eval *= mis_weight; } diff --git a/intern/cycles/kernel/integrator/shade_shadow.h b/intern/cycles/kernel/integrator/shade_shadow.h index 1de890aae29..a68fcaa7a64 100644 --- a/intern/cycles/kernel/integrator/shade_shadow.h +++ b/intern/cycles/kernel/integrator/shade_shadow.h @@ -95,8 +95,8 @@ ccl_device_inline void integrate_transparent_volume_shadow(KernelGlobals kg, shader_setup_from_volume(kg, shadow_sd, &ray); - const float step_size = volume_stack_step_size( - kg, [=](const int i) { return integrator_state_read_shadow_volume_stack(state, i); }); + VOLUME_READ_LAMBDA(integrator_state_read_shadow_volume_stack(state, i)); + const float step_size = volume_stack_step_size(kg, volume_read_lambda_pass); volume_shadow_heterogeneous(kg, state, &ray, shadow_sd, throughput, step_size); } diff --git a/intern/cycles/kernel/integrator/shade_surface.h b/intern/cycles/kernel/integrator/shade_surface.h index 2793dd3e218..c9c586f5ae4 100644 --- a/intern/cycles/kernel/integrator/shade_surface.h +++ b/intern/cycles/kernel/integrator/shade_surface.h @@ -27,8 +27,6 @@ #include "kernel/light/light.h" #include "kernel/light/sample.h" -#include "kernel/sample/mis.h" - CCL_NAMESPACE_BEGIN ccl_device_forceinline void integrate_surface_shader_setup(KernelGlobals kg, @@ -95,8 +93,7 @@ ccl_device_forceinline void integrate_surface_emission(KernelGlobals kg, /* Multiple importance sampling, get triangle light pdf, * and compute weight with respect to BSDF pdf. */ float pdf = triangle_light_pdf(kg, sd, t); - float mis_weight = power_heuristic(bsdf_pdf, pdf); - + float mis_weight = light_sample_mis_weight_forward(kg, bsdf_pdf, pdf); L *= mis_weight; } @@ -155,7 +152,7 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg, bsdf_eval_mul3(&bsdf_eval, light_eval / ls.pdf); if (ls.shader & SHADER_USE_MIS) { - const float mis_weight = power_heuristic(ls.pdf, bsdf_pdf); + const float mis_weight = light_sample_mis_weight_nee(kg, ls.pdf, bsdf_pdf); bsdf_eval_mul(&bsdf_eval, mis_weight); } @@ -195,12 +192,13 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg, const float3 throughput = INTEGRATOR_STATE(state, path, throughput) * bsdf_eval_sum(&bsdf_eval); if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) { - const float3 pass_diffuse_weight = (bounce == 0) ? - bsdf_eval_pass_diffuse_weight(&bsdf_eval) : - INTEGRATOR_STATE(state, path, pass_diffuse_weight); - const float3 pass_glossy_weight = (bounce == 0) ? - bsdf_eval_pass_glossy_weight(&bsdf_eval) : - INTEGRATOR_STATE(state, path, pass_glossy_weight); + const packed_float3 pass_diffuse_weight = + (bounce == 0) ? packed_float3(bsdf_eval_pass_diffuse_weight(&bsdf_eval)) : + INTEGRATOR_STATE(state, path, pass_diffuse_weight); + const packed_float3 pass_glossy_weight = (bounce == 0) ? + packed_float3( + bsdf_eval_pass_glossy_weight(&bsdf_eval)) : + INTEGRATOR_STATE(state, path, pass_glossy_weight); INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_diffuse_weight) = pass_diffuse_weight; INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_glossy_weight) = pass_glossy_weight; } diff --git a/intern/cycles/kernel/integrator/shade_volume.h b/intern/cycles/kernel/integrator/shade_volume.h index cc47557d580..712c22357b8 100644 --- a/intern/cycles/kernel/integrator/shade_volume.h +++ b/intern/cycles/kernel/integrator/shade_volume.h @@ -27,8 +27,6 @@ #include "kernel/light/light.h" #include "kernel/light/sample.h" -#include "kernel/sample/mis.h" - CCL_NAMESPACE_BEGIN #ifdef __VOLUME__ @@ -78,9 +76,8 @@ ccl_device_inline bool shadow_volume_shader_sample(KernelGlobals kg, ccl_private ShaderData *ccl_restrict sd, ccl_private float3 *ccl_restrict extinction) { - shader_eval_volume<true>(kg, state, sd, PATH_RAY_SHADOW, [=](const int i) { - return integrator_state_read_shadow_volume_stack(state, i); - }); + VOLUME_READ_LAMBDA(integrator_state_read_shadow_volume_stack(state, i)) + shader_eval_volume<true>(kg, state, sd, PATH_RAY_SHADOW, volume_read_lambda_pass); if (!(sd->flag & SD_EXTINCTION)) { return false; @@ -98,9 +95,8 @@ ccl_device_inline bool volume_shader_sample(KernelGlobals kg, ccl_private VolumeShaderCoefficients *coeff) { const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag); - shader_eval_volume<false>(kg, state, sd, path_flag, [=](const int i) { - return integrator_state_read_volume_stack(state, i); - }); + VOLUME_READ_LAMBDA(integrator_state_read_volume_stack(state, i)) + shader_eval_volume<false>(kg, state, sd, path_flag, volume_read_lambda_pass); if (!(sd->flag & (SD_EXTINCTION | SD_SCATTER | SD_EMISSION))) { return false; @@ -772,7 +768,7 @@ ccl_device_forceinline void integrate_volume_direct_light( const float phase_pdf = shader_volume_phase_eval(kg, sd, phases, ls->D, &phase_eval); if (ls->shader & SHADER_USE_MIS) { - float mis_weight = power_heuristic(ls->pdf, phase_pdf); + float mis_weight = light_sample_mis_weight_nee(kg, ls->pdf, phase_pdf); bsdf_eval_mul(&phase_eval, mis_weight); } @@ -805,9 +801,10 @@ ccl_device_forceinline void integrate_volume_direct_light( const float3 throughput_phase = throughput * bsdf_eval_sum(&phase_eval); if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) { - const float3 pass_diffuse_weight = (bounce == 0) ? - one_float3() : - INTEGRATOR_STATE(state, path, pass_diffuse_weight); + const packed_float3 pass_diffuse_weight = (bounce == 0) ? + packed_float3(one_float3()) : + INTEGRATOR_STATE( + state, path, pass_diffuse_weight); INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_diffuse_weight) = pass_diffuse_weight; INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_glossy_weight) = zero_float3(); } @@ -932,8 +929,8 @@ ccl_device VolumeIntegrateEvent volume_integrate(KernelGlobals kg, VOLUME_SAMPLE_DISTANCE; /* Step through volume. */ - const float step_size = volume_stack_step_size( - kg, [=](const int i) { return integrator_state_read_volume_stack(state, i); }); + VOLUME_READ_LAMBDA(integrator_state_read_volume_stack(state, i)) + const float step_size = volume_stack_step_size(kg, volume_read_lambda_pass); /* TODO: expensive to zero closures? */ VolumeIntegrateResult result = {}; diff --git a/intern/cycles/kernel/integrator/shadow_state_template.h b/intern/cycles/kernel/integrator/shadow_state_template.h index 667ab88c8c4..625a429d3db 100644 --- a/intern/cycles/kernel/integrator/shadow_state_template.h +++ b/intern/cycles/kernel/integrator/shadow_state_template.h @@ -40,15 +40,15 @@ KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, queued_kernel, KERNEL_FEATURE_PATH_T /* enum PathRayFlag */ KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING) /* Throughput. */ -KERNEL_STRUCT_MEMBER(shadow_path, float3, throughput, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(shadow_path, packed_float3, throughput, KERNEL_FEATURE_PATH_TRACING) /* Throughput for shadow pass. */ KERNEL_STRUCT_MEMBER(shadow_path, - float3, + packed_float3, unshadowed_throughput, KERNEL_FEATURE_SHADOW_PASS | KERNEL_FEATURE_AO_ADDITIVE) /* Ratio of throughput to distinguish diffuse / glossy / transmission render passes. */ -KERNEL_STRUCT_MEMBER(shadow_path, float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES) -KERNEL_STRUCT_MEMBER(shadow_path, float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES) +KERNEL_STRUCT_MEMBER(shadow_path, packed_float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES) +KERNEL_STRUCT_MEMBER(shadow_path, packed_float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES) /* Number of intersections found by ray-tracing. */ KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, num_hits, KERNEL_FEATURE_PATH_TRACING) KERNEL_STRUCT_END(shadow_path) @@ -56,8 +56,8 @@ KERNEL_STRUCT_END(shadow_path) /********************************** Shadow Ray *******************************/ KERNEL_STRUCT_BEGIN(shadow_ray) -KERNEL_STRUCT_MEMBER(shadow_ray, float3, P, KERNEL_FEATURE_PATH_TRACING) -KERNEL_STRUCT_MEMBER(shadow_ray, float3, D, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(shadow_ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(shadow_ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING) KERNEL_STRUCT_MEMBER(shadow_ray, float, t, KERNEL_FEATURE_PATH_TRACING) KERNEL_STRUCT_MEMBER(shadow_ray, float, time, KERNEL_FEATURE_PATH_TRACING) KERNEL_STRUCT_MEMBER(shadow_ray, float, dP, KERNEL_FEATURE_PATH_TRACING) diff --git a/intern/cycles/kernel/integrator/state_template.h b/intern/cycles/kernel/integrator/state_template.h index 3299f973713..bd18a7498a3 100644 --- a/intern/cycles/kernel/integrator/state_template.h +++ b/intern/cycles/kernel/integrator/state_template.h @@ -59,12 +59,12 @@ KERNEL_STRUCT_MEMBER(path, float, min_ray_pdf, KERNEL_FEATURE_PATH_TRACING) /* Continuation probability for path termination. */ KERNEL_STRUCT_MEMBER(path, float, continuation_probability, KERNEL_FEATURE_PATH_TRACING) /* Throughput. */ -KERNEL_STRUCT_MEMBER(path, float3, throughput, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(path, packed_float3, throughput, KERNEL_FEATURE_PATH_TRACING) /* Ratio of throughput to distinguish diffuse / glossy / transmission render passes. */ -KERNEL_STRUCT_MEMBER(path, float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES) -KERNEL_STRUCT_MEMBER(path, float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES) +KERNEL_STRUCT_MEMBER(path, packed_float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES) +KERNEL_STRUCT_MEMBER(path, packed_float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES) /* Denoising. */ -KERNEL_STRUCT_MEMBER(path, float3, denoising_feature_throughput, KERNEL_FEATURE_DENOISING) +KERNEL_STRUCT_MEMBER(path, packed_float3, denoising_feature_throughput, KERNEL_FEATURE_DENOISING) /* Shader sorting. */ /* TODO: compress as uint16? or leave out entirely and recompute key in sorting code? */ KERNEL_STRUCT_MEMBER(path, uint32_t, shader_sort_key, KERNEL_FEATURE_PATH_TRACING) @@ -73,8 +73,8 @@ KERNEL_STRUCT_END(path) /************************************** Ray ***********************************/ KERNEL_STRUCT_BEGIN(ray) -KERNEL_STRUCT_MEMBER(ray, float3, P, KERNEL_FEATURE_PATH_TRACING) -KERNEL_STRUCT_MEMBER(ray, float3, D, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING) +KERNEL_STRUCT_MEMBER(ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING) KERNEL_STRUCT_MEMBER(ray, float, t, KERNEL_FEATURE_PATH_TRACING) KERNEL_STRUCT_MEMBER(ray, float, time, KERNEL_FEATURE_PATH_TRACING) KERNEL_STRUCT_MEMBER(ray, float, dP, KERNEL_FEATURE_PATH_TRACING) @@ -96,10 +96,10 @@ KERNEL_STRUCT_END(isect) /*************** Subsurface closure state for subsurface kernel ***************/ KERNEL_STRUCT_BEGIN(subsurface) -KERNEL_STRUCT_MEMBER(subsurface, float3, albedo, KERNEL_FEATURE_SUBSURFACE) -KERNEL_STRUCT_MEMBER(subsurface, float3, radius, KERNEL_FEATURE_SUBSURFACE) +KERNEL_STRUCT_MEMBER(subsurface, packed_float3, albedo, KERNEL_FEATURE_SUBSURFACE) +KERNEL_STRUCT_MEMBER(subsurface, packed_float3, radius, KERNEL_FEATURE_SUBSURFACE) KERNEL_STRUCT_MEMBER(subsurface, float, anisotropy, KERNEL_FEATURE_SUBSURFACE) -KERNEL_STRUCT_MEMBER(subsurface, float3, Ng, KERNEL_FEATURE_SUBSURFACE) +KERNEL_STRUCT_MEMBER(subsurface, packed_float3, Ng, KERNEL_FEATURE_SUBSURFACE) KERNEL_STRUCT_END(subsurface) /********************************** Volume Stack ******************************/ diff --git a/intern/cycles/kernel/integrator/volume_stack.h b/intern/cycles/kernel/integrator/volume_stack.h index cf69826ffff..ea3fa901e2d 100644 --- a/intern/cycles/kernel/integrator/volume_stack.h +++ b/intern/cycles/kernel/integrator/volume_stack.h @@ -18,6 +18,14 @@ CCL_NAMESPACE_BEGIN +/* Volumetric read/write lambda functions - default implementations */ +#ifndef VOLUME_READ_LAMBDA +# define VOLUME_READ_LAMBDA(function_call) \ + auto volume_read_lambda_pass = [=](const int i) { return function_call; }; +# define VOLUME_WRITE_LAMBDA(function_call) \ + auto volume_write_lambda_pass = [=](const int i, VolumeStack entry) { function_call; }; +#endif + /* Volume Stack * * This is an array of object/shared ID's that the current segment of the path @@ -88,26 +96,18 @@ ccl_device void volume_stack_enter_exit(KernelGlobals kg, IntegratorState state, ccl_private const ShaderData *sd) { - volume_stack_enter_exit( - kg, - sd, - [=](const int i) { return integrator_state_read_volume_stack(state, i); }, - [=](const int i, const VolumeStack entry) { - integrator_state_write_volume_stack(state, i, entry); - }); + VOLUME_READ_LAMBDA(integrator_state_read_volume_stack(state, i)) + VOLUME_WRITE_LAMBDA(integrator_state_write_volume_stack(state, i, entry)) + volume_stack_enter_exit(kg, sd, volume_read_lambda_pass, volume_write_lambda_pass); } ccl_device void shadow_volume_stack_enter_exit(KernelGlobals kg, IntegratorShadowState state, ccl_private const ShaderData *sd) { - volume_stack_enter_exit( - kg, - sd, - [=](const int i) { return integrator_state_read_shadow_volume_stack(state, i); }, - [=](const int i, const VolumeStack entry) { - integrator_state_write_shadow_volume_stack(state, i, entry); - }); + VOLUME_READ_LAMBDA(integrator_state_read_shadow_volume_stack(state, i)) + VOLUME_WRITE_LAMBDA(integrator_state_write_shadow_volume_stack(state, i, entry)) + volume_stack_enter_exit(kg, sd, volume_read_lambda_pass, volume_write_lambda_pass); } /* Clean stack after the last bounce. diff --git a/intern/cycles/kernel/light/light.h b/intern/cycles/kernel/light/light.h index 97dca936552..e0a9f1c57f5 100644 --- a/intern/cycles/kernel/light/light.h +++ b/intern/cycles/kernel/light/light.h @@ -676,19 +676,7 @@ ccl_device_forceinline void triangle_light_sample(KernelGlobals kg, ls->D = z * B + safe_sqrtf(1.0f - z * z) * safe_normalize(C_ - dot(C_, B) * B); /* calculate intersection with the planar triangle */ - if (!ray_triangle_intersect(P, - ls->D, - FLT_MAX, -#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) - (ssef *)V, -#else - V[0], - V[1], - V[2], -#endif - &ls->u, - &ls->v, - &ls->t)) { + if (!ray_triangle_intersect(P, ls->D, FLT_MAX, V[0], V[1], V[2], &ls->u, &ls->v, &ls->t)) { ls->pdf = 0.0f; return; } diff --git a/intern/cycles/kernel/light/sample.h b/intern/cycles/kernel/light/sample.h index 6b643a95250..ff5d43ed8cd 100644 --- a/intern/cycles/kernel/light/sample.h +++ b/intern/cycles/kernel/light/sample.h @@ -22,6 +22,7 @@ #include "kernel/light/light.h" #include "kernel/sample/mapping.h" +#include "kernel/sample/mis.h" CCL_NAMESPACE_BEGIN @@ -268,4 +269,36 @@ ccl_device_inline void light_sample_to_volume_shadow_ray( shadow_ray_setup(sd, ls, P, ray); } +ccl_device_inline float light_sample_mis_weight_forward(KernelGlobals kg, + const float forward_pdf, + const float nee_pdf) +{ +#ifdef WITH_CYCLES_DEBUG + if (kernel_data.integrator.direct_light_sampling_type == DIRECT_LIGHT_SAMPLING_FORWARD) { + return 1.0f; + } + else if (kernel_data.integrator.direct_light_sampling_type == DIRECT_LIGHT_SAMPLING_NEE) { + return 0.0f; + } + else +#endif + return power_heuristic(forward_pdf, nee_pdf); +} + +ccl_device_inline float light_sample_mis_weight_nee(KernelGlobals kg, + const float nee_pdf, + const float forward_pdf) +{ +#ifdef WITH_CYCLES_DEBUG + if (kernel_data.integrator.direct_light_sampling_type == DIRECT_LIGHT_SAMPLING_FORWARD) { + return 0.0f; + } + else if (kernel_data.integrator.direct_light_sampling_type == DIRECT_LIGHT_SAMPLING_NEE) { + return 1.0f; + } + else +#endif + return power_heuristic(nee_pdf, forward_pdf); +} + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/osl/CMakeLists.txt b/intern/cycles/kernel/osl/CMakeLists.txt index f226c95766f..90e16bd70d4 100644 --- a/intern/cycles/kernel/osl/CMakeLists.txt +++ b/intern/cycles/kernel/osl/CMakeLists.txt @@ -55,7 +55,7 @@ if(APPLE) # Disable allocation warning on macOS prior to 10.14: the OSLRenderServices # contains member which is 64 bytes aligned (cache inside of OIIO's # unordered_map_concurrent). This is not something what the SDK supportsm, but - # since we take care of allocations ourselves is is OK to ignore the + # since we take care of allocations ourselves is OK to ignore the # diagnostic message. string(APPEND CMAKE_CXX_FLAGS " -faligned-allocation") endif() diff --git a/intern/cycles/kernel/sample/lcg.h b/intern/cycles/kernel/sample/lcg.h index 92cfff639b4..e8c4915813e 100644 --- a/intern/cycles/kernel/sample/lcg.h +++ b/intern/cycles/kernel/sample/lcg.h @@ -19,14 +19,16 @@ CCL_NAMESPACE_BEGIN /* Linear Congruential Generator */ -ccl_device uint lcg_step_uint(uint *rng) +/* This is templated to handle multiple address spaces on Metal. */ +template<class T> ccl_device uint lcg_step_uint(T rng) { /* implicit mod 2^32 */ *rng = (1103515245 * (*rng) + 12345); return *rng; } -ccl_device float lcg_step_float(uint *rng) +/* This is templated to handle multiple address spaces on Metal. */ +template<class T> ccl_device float lcg_step_float(T rng) { /* implicit mod 2^32 */ *rng = (1103515245 * (*rng) + 12345); diff --git a/intern/cycles/kernel/sample/pattern.h b/intern/cycles/kernel/sample/pattern.h index 0c27992c7f6..adc8493badd 100644 --- a/intern/cycles/kernel/sample/pattern.h +++ b/intern/cycles/kernel/sample/pattern.h @@ -163,18 +163,7 @@ ccl_device_inline bool sample_is_even(int pattern, int sample) /* See Section 10.2.1, "Progressive Multi-Jittered Sample Sequences", Christensen et al. * We can use this to get divide sample sequence into two classes for easier variance * estimation. */ -#if defined(__GNUC__) && !defined(__KERNEL_GPU__) - return __builtin_popcount(sample & 0xaaaaaaaa) & 1; -#elif defined(__NVCC__) - return __popc(sample & 0xaaaaaaaa) & 1; -#else - /* TODO(Stefan): pop-count intrinsic for Windows with fallback for older CPUs. */ - int i = sample & 0xaaaaaaaa; - i = i - ((i >> 1) & 0x55555555); - i = (i & 0x33333333) + ((i >> 2) & 0x33333333); - i = (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24; - return i & 1; -#endif + return popcount(uint(sample) & 0xaaaaaaaa) & 1; } else { /* TODO(Stefan): Are there reliable ways of dividing CMJ and Sobol into two classes? */ diff --git a/intern/cycles/kernel/svm/math_util.h b/intern/cycles/kernel/svm/math_util.h index b2e539cdd1f..20817cd0fd3 100644 --- a/intern/cycles/kernel/svm/math_util.h +++ b/intern/cycles/kernel/svm/math_util.h @@ -212,33 +212,6 @@ ccl_device float3 svm_math_blackbody_color(float t) * which is enough to get the same 8 bit/channel color. */ - const float blackbody_table_r[6][3] = { - {2.52432244e+03f, -1.06185848e-03f, 3.11067539e+00f}, - {3.37763626e+03f, -4.34581697e-04f, 1.64843306e+00f}, - {4.10671449e+03f, -8.61949938e-05f, 6.41423749e-01f}, - {4.66849800e+03f, 2.85655028e-05f, 1.29075375e-01f}, - {4.60124770e+03f, 2.89727618e-05f, 1.48001316e-01f}, - {3.78765709e+03f, 9.36026367e-06f, 3.98995841e-01f}, - }; - - const float blackbody_table_g[6][3] = { - {-7.50343014e+02f, 3.15679613e-04f, 4.73464526e-01f}, - {-1.00402363e+03f, 1.29189794e-04f, 9.08181524e-01f}, - {-1.22075471e+03f, 2.56245413e-05f, 1.20753416e+00f}, - {-1.42546105e+03f, -4.01730887e-05f, 1.44002695e+00f}, - {-1.18134453e+03f, -2.18913373e-05f, 1.30656109e+00f}, - {-5.00279505e+02f, -4.59745390e-06f, 1.09090465e+00f}, - }; - - const float blackbody_table_b[6][4] = { - {0.0f, 0.0f, 0.0f, 0.0f}, /* zeros should be optimized by compiler */ - {0.0f, 0.0f, 0.0f, 0.0f}, - {0.0f, 0.0f, 0.0f, 0.0f}, - {-2.02524603e-11f, 1.79435860e-07f, -2.60561875e-04f, -1.41761141e-02f}, - {-2.22463426e-13f, -1.55078698e-08f, 3.81675160e-04f, -7.30646033e-01f}, - {6.72595954e-13f, -2.73059993e-08f, 4.24068546e-04f, -7.52204323e-01f}, - }; - if (t >= 12000.0f) { return make_float3(0.826270103f, 0.994478524f, 1.56626022f); } diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h index 62ba5bf04e3..ce32e1a520f 100644 --- a/intern/cycles/kernel/svm/svm.h +++ b/intern/cycles/kernel/svm/svm.h @@ -220,7 +220,7 @@ CCL_NAMESPACE_BEGIN template<uint node_feature_mask, ShaderType type, typename ConstIntegratorGenericState> ccl_device void svm_eval_nodes(KernelGlobals kg, ConstIntegratorGenericState state, - ShaderData *sd, + ccl_private ShaderData *sd, ccl_global float *render_buffer, uint32_t path_flag) { diff --git a/intern/cycles/kernel/svm/wavelength.h b/intern/cycles/kernel/svm/wavelength.h index 28fd172abc7..6e25224243f 100644 --- a/intern/cycles/kernel/svm/wavelength.h +++ b/intern/cycles/kernel/svm/wavelength.h @@ -42,41 +42,6 @@ ccl_device_noinline void svm_node_wavelength(KernelGlobals kg, uint wavelength, uint color_out) { - // CIE colour matching functions xBar, yBar, and zBar for - // wavelengths from 380 through 780 nanometers, every 5 - // nanometers. For a wavelength lambda in this range: - // cie_colour_match[(lambda - 380) / 5][0] = xBar - // cie_colour_match[(lambda - 380) / 5][1] = yBar - // cie_colour_match[(lambda - 380) / 5][2] = zBar - const float cie_colour_match[81][3] = { - {0.0014f, 0.0000f, 0.0065f}, {0.0022f, 0.0001f, 0.0105f}, {0.0042f, 0.0001f, 0.0201f}, - {0.0076f, 0.0002f, 0.0362f}, {0.0143f, 0.0004f, 0.0679f}, {0.0232f, 0.0006f, 0.1102f}, - {0.0435f, 0.0012f, 0.2074f}, {0.0776f, 0.0022f, 0.3713f}, {0.1344f, 0.0040f, 0.6456f}, - {0.2148f, 0.0073f, 1.0391f}, {0.2839f, 0.0116f, 1.3856f}, {0.3285f, 0.0168f, 1.6230f}, - {0.3483f, 0.0230f, 1.7471f}, {0.3481f, 0.0298f, 1.7826f}, {0.3362f, 0.0380f, 1.7721f}, - {0.3187f, 0.0480f, 1.7441f}, {0.2908f, 0.0600f, 1.6692f}, {0.2511f, 0.0739f, 1.5281f}, - {0.1954f, 0.0910f, 1.2876f}, {0.1421f, 0.1126f, 1.0419f}, {0.0956f, 0.1390f, 0.8130f}, - {0.0580f, 0.1693f, 0.6162f}, {0.0320f, 0.2080f, 0.4652f}, {0.0147f, 0.2586f, 0.3533f}, - {0.0049f, 0.3230f, 0.2720f}, {0.0024f, 0.4073f, 0.2123f}, {0.0093f, 0.5030f, 0.1582f}, - {0.0291f, 0.6082f, 0.1117f}, {0.0633f, 0.7100f, 0.0782f}, {0.1096f, 0.7932f, 0.0573f}, - {0.1655f, 0.8620f, 0.0422f}, {0.2257f, 0.9149f, 0.0298f}, {0.2904f, 0.9540f, 0.0203f}, - {0.3597f, 0.9803f, 0.0134f}, {0.4334f, 0.9950f, 0.0087f}, {0.5121f, 1.0000f, 0.0057f}, - {0.5945f, 0.9950f, 0.0039f}, {0.6784f, 0.9786f, 0.0027f}, {0.7621f, 0.9520f, 0.0021f}, - {0.8425f, 0.9154f, 0.0018f}, {0.9163f, 0.8700f, 0.0017f}, {0.9786f, 0.8163f, 0.0014f}, - {1.0263f, 0.7570f, 0.0011f}, {1.0567f, 0.6949f, 0.0010f}, {1.0622f, 0.6310f, 0.0008f}, - {1.0456f, 0.5668f, 0.0006f}, {1.0026f, 0.5030f, 0.0003f}, {0.9384f, 0.4412f, 0.0002f}, - {0.8544f, 0.3810f, 0.0002f}, {0.7514f, 0.3210f, 0.0001f}, {0.6424f, 0.2650f, 0.0000f}, - {0.5419f, 0.2170f, 0.0000f}, {0.4479f, 0.1750f, 0.0000f}, {0.3608f, 0.1382f, 0.0000f}, - {0.2835f, 0.1070f, 0.0000f}, {0.2187f, 0.0816f, 0.0000f}, {0.1649f, 0.0610f, 0.0000f}, - {0.1212f, 0.0446f, 0.0000f}, {0.0874f, 0.0320f, 0.0000f}, {0.0636f, 0.0232f, 0.0000f}, - {0.0468f, 0.0170f, 0.0000f}, {0.0329f, 0.0119f, 0.0000f}, {0.0227f, 0.0082f, 0.0000f}, - {0.0158f, 0.0057f, 0.0000f}, {0.0114f, 0.0041f, 0.0000f}, {0.0081f, 0.0029f, 0.0000f}, - {0.0058f, 0.0021f, 0.0000f}, {0.0041f, 0.0015f, 0.0000f}, {0.0029f, 0.0010f, 0.0000f}, - {0.0020f, 0.0007f, 0.0000f}, {0.0014f, 0.0005f, 0.0000f}, {0.0010f, 0.0004f, 0.0000f}, - {0.0007f, 0.0002f, 0.0000f}, {0.0005f, 0.0002f, 0.0000f}, {0.0003f, 0.0001f, 0.0000f}, - {0.0002f, 0.0001f, 0.0000f}, {0.0002f, 0.0001f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f}, - {0.0001f, 0.0000f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f}, {0.0000f, 0.0000f, 0.0000f}}; - float lambda_nm = stack_load_float(stack, wavelength); float ii = (lambda_nm - 380.0f) * (1.0f / 5.0f); // scaled 0..80 int i = float_to_int(ii); diff --git a/intern/cycles/kernel/tables.h b/intern/cycles/kernel/tables.h new file mode 100644 index 00000000000..768033d4ffe --- /dev/null +++ b/intern/cycles/kernel/tables.h @@ -0,0 +1,76 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* clang-format off */ + +ccl_inline_constant float blackbody_table_r[][3] = { + {2.52432244e+03f, -1.06185848e-03f, 3.11067539e+00f}, + {3.37763626e+03f, -4.34581697e-04f, 1.64843306e+00f}, + {4.10671449e+03f, -8.61949938e-05f, 6.41423749e-01f}, + {4.66849800e+03f, 2.85655028e-05f, 1.29075375e-01f}, + {4.60124770e+03f, 2.89727618e-05f, 1.48001316e-01f}, + {3.78765709e+03f, 9.36026367e-06f, 3.98995841e-01f} +}; + +ccl_inline_constant float blackbody_table_g[][3] = { + {-7.50343014e+02f, 3.15679613e-04f, 4.73464526e-01f}, + {-1.00402363e+03f, 1.29189794e-04f, 9.08181524e-01f}, + {-1.22075471e+03f, 2.56245413e-05f, 1.20753416e+00f}, + {-1.42546105e+03f, -4.01730887e-05f, 1.44002695e+00f}, + {-1.18134453e+03f, -2.18913373e-05f, 1.30656109e+00f}, + {-5.00279505e+02f, -4.59745390e-06f, 1.09090465e+00f} +}; + +ccl_inline_constant float blackbody_table_b[][4] = { + {0.0f, 0.0f, 0.0f, 0.0f}, /* zeros should be optimized by compiler */ + {0.0f, 0.0f, 0.0f, 0.0f}, + {0.0f, 0.0f, 0.0f, 0.0f}, + {-2.02524603e-11f, 1.79435860e-07f, -2.60561875e-04f, -1.41761141e-02f}, + {-2.22463426e-13f, -1.55078698e-08f, 3.81675160e-04f, -7.30646033e-01f}, + {6.72595954e-13f, -2.73059993e-08f, 4.24068546e-04f, -7.52204323e-01f} +}; + +ccl_inline_constant float cie_colour_match[][3] = { + {0.0014f, 0.0000f, 0.0065f}, {0.0022f, 0.0001f, 0.0105f}, {0.0042f, 0.0001f, 0.0201f}, + {0.0076f, 0.0002f, 0.0362f}, {0.0143f, 0.0004f, 0.0679f}, {0.0232f, 0.0006f, 0.1102f}, + {0.0435f, 0.0012f, 0.2074f}, {0.0776f, 0.0022f, 0.3713f}, {0.1344f, 0.0040f, 0.6456f}, + {0.2148f, 0.0073f, 1.0391f}, {0.2839f, 0.0116f, 1.3856f}, {0.3285f, 0.0168f, 1.6230f}, + {0.3483f, 0.0230f, 1.7471f}, {0.3481f, 0.0298f, 1.7826f}, {0.3362f, 0.0380f, 1.7721f}, + {0.3187f, 0.0480f, 1.7441f}, {0.2908f, 0.0600f, 1.6692f}, {0.2511f, 0.0739f, 1.5281f}, + {0.1954f, 0.0910f, 1.2876f}, {0.1421f, 0.1126f, 1.0419f}, {0.0956f, 0.1390f, 0.8130f}, + {0.0580f, 0.1693f, 0.6162f}, {0.0320f, 0.2080f, 0.4652f}, {0.0147f, 0.2586f, 0.3533f}, + {0.0049f, 0.3230f, 0.2720f}, {0.0024f, 0.4073f, 0.2123f}, {0.0093f, 0.5030f, 0.1582f}, + {0.0291f, 0.6082f, 0.1117f}, {0.0633f, 0.7100f, 0.0782f}, {0.1096f, 0.7932f, 0.0573f}, + {0.1655f, 0.8620f, 0.0422f}, {0.2257f, 0.9149f, 0.0298f}, {0.2904f, 0.9540f, 0.0203f}, + {0.3597f, 0.9803f, 0.0134f}, {0.4334f, 0.9950f, 0.0087f}, {0.5121f, 1.0000f, 0.0057f}, + {0.5945f, 0.9950f, 0.0039f}, {0.6784f, 0.9786f, 0.0027f}, {0.7621f, 0.9520f, 0.0021f}, + {0.8425f, 0.9154f, 0.0018f}, {0.9163f, 0.8700f, 0.0017f}, {0.9786f, 0.8163f, 0.0014f}, + {1.0263f, 0.7570f, 0.0011f}, {1.0567f, 0.6949f, 0.0010f}, {1.0622f, 0.6310f, 0.0008f}, + {1.0456f, 0.5668f, 0.0006f}, {1.0026f, 0.5030f, 0.0003f}, {0.9384f, 0.4412f, 0.0002f}, + {0.8544f, 0.3810f, 0.0002f}, {0.7514f, 0.3210f, 0.0001f}, {0.6424f, 0.2650f, 0.0000f}, + {0.5419f, 0.2170f, 0.0000f}, {0.4479f, 0.1750f, 0.0000f}, {0.3608f, 0.1382f, 0.0000f}, + {0.2835f, 0.1070f, 0.0000f}, {0.2187f, 0.0816f, 0.0000f}, {0.1649f, 0.0610f, 0.0000f}, + {0.1212f, 0.0446f, 0.0000f}, {0.0874f, 0.0320f, 0.0000f}, {0.0636f, 0.0232f, 0.0000f}, + {0.0468f, 0.0170f, 0.0000f}, {0.0329f, 0.0119f, 0.0000f}, {0.0227f, 0.0082f, 0.0000f}, + {0.0158f, 0.0057f, 0.0000f}, {0.0114f, 0.0041f, 0.0000f}, {0.0081f, 0.0029f, 0.0000f}, + {0.0058f, 0.0021f, 0.0000f}, {0.0041f, 0.0015f, 0.0000f}, {0.0029f, 0.0010f, 0.0000f}, + {0.0020f, 0.0007f, 0.0000f}, {0.0014f, 0.0005f, 0.0000f}, {0.0010f, 0.0004f, 0.0000f}, + {0.0007f, 0.0002f, 0.0000f}, {0.0005f, 0.0002f, 0.0000f}, {0.0003f, 0.0001f, 0.0000f}, + {0.0002f, 0.0001f, 0.0000f}, {0.0002f, 0.0001f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f}, + {0.0001f, 0.0000f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f}, {0.0000f, 0.0000f, 0.0000f} +}; + +/* clang-format on */
\ No newline at end of file diff --git a/intern/cycles/kernel/textures.h b/intern/cycles/kernel/textures.h index 464ecb183cb..2e3ae29a19a 100644 --- a/intern/cycles/kernel/textures.h +++ b/intern/cycles/kernel/textures.h @@ -40,11 +40,11 @@ KERNEL_TEX(DecomposedTransform, __camera_motion) /* triangles */ KERNEL_TEX(uint, __tri_shader) -KERNEL_TEX(float4, __tri_vnormal) +KERNEL_TEX(packed_float3, __tri_vnormal) KERNEL_TEX(uint4, __tri_vindex) KERNEL_TEX(uint, __tri_patch) KERNEL_TEX(float2, __tri_patch_uv) -KERNEL_TEX(float4, __tri_verts) +KERNEL_TEX(packed_float3, __tri_verts) /* curves */ KERNEL_TEX(KernelCurve, __curves) @@ -58,7 +58,8 @@ KERNEL_TEX(uint, __patches) KERNEL_TEX(uint4, __attributes_map) KERNEL_TEX(float, __attributes_float) KERNEL_TEX(float2, __attributes_float2) -KERNEL_TEX(float4, __attributes_float3) +KERNEL_TEX(packed_float3, __attributes_float3) +KERNEL_TEX(float4, __attributes_float4) KERNEL_TEX(uchar4, __attributes_uchar4) /* lights */ diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h index 2f6cadf7496..4a730dbfaaa 100644 --- a/intern/cycles/kernel/types.h +++ b/intern/cycles/kernel/types.h @@ -36,13 +36,6 @@ # define __KERNEL_CPU__ #endif -/* TODO(sergey): This is only to make it possible to include this header - * from outside of the kernel. but this could be done somewhat cleaner? - */ -#ifndef ccl_addr_space -# define ccl_addr_space -#endif - CCL_NAMESPACE_BEGIN /* Constants */ @@ -489,6 +482,16 @@ enum PanoramaType { PANORAMA_NUM_TYPES, }; +/* Direct Light Sampling */ + +enum DirectLightSamplingType { + DIRECT_LIGHT_SAMPLING_MIS = 0, + DIRECT_LIGHT_SAMPLING_FORWARD = 1, + DIRECT_LIGHT_SAMPLING_NEE = 2, + + DIRECT_LIGHT_SAMPLING_NUM, +}; + /* Differential */ typedef struct differential3 { @@ -1201,8 +1204,11 @@ typedef struct KernelIntegrator { /* Closure filter. */ int filter_closures; + /* MIS debuging */ + int direct_light_sampling_type; + /* padding */ - int pad1, pad2, pad3; + int pad1, pad2; } KernelIntegrator; static_assert_align(KernelIntegrator, 16); @@ -1426,6 +1432,7 @@ typedef struct KernelWorkTile { uint start_sample; uint num_samples; + uint sample_offset; int offset; uint stride; diff --git a/intern/cycles/scene/attribute.cpp b/intern/cycles/scene/attribute.cpp index 3401eea307f..6d15f3325f7 100644 --- a/intern/cycles/scene/attribute.cpp +++ b/intern/cycles/scene/attribute.cpp @@ -404,6 +404,10 @@ AttrKernelDataType Attribute::kernel_type(const Attribute &attr) return AttrKernelDataType::FLOAT2; } + if (attr.type == TypeFloat4 || attr.type == TypeRGBA || attr.type == TypeDesc::TypeMatrix) { + return AttrKernelDataType::FLOAT4; + } + return AttrKernelDataType::FLOAT3; } @@ -585,7 +589,7 @@ Attribute *AttributeSet::add(AttributeStandard std, ustring name) attr = add(name, TypeDesc::TypePoint, ATTR_ELEMENT_CURVE); break; case ATTR_STD_MOTION_VERTEX_POSITION: - attr = add(name, TypeDesc::TypePoint, ATTR_ELEMENT_CURVE_KEY_MOTION); + attr = add(name, TypeDesc::TypeFloat4, ATTR_ELEMENT_CURVE_KEY_MOTION); break; case ATTR_STD_CURVE_INTERCEPT: attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_CURVE_KEY); diff --git a/intern/cycles/scene/attribute.h b/intern/cycles/scene/attribute.h index 4a25a900c14..612a0b7c80d 100644 --- a/intern/cycles/scene/attribute.h +++ b/intern/cycles/scene/attribute.h @@ -47,12 +47,7 @@ struct Transform; * * The values of this enumeration are also used as flags to detect changes in AttributeSet. */ -enum AttrKernelDataType { - FLOAT = 0, - FLOAT2 = 1, - FLOAT3 = 2, - UCHAR4 = 3, -}; +enum AttrKernelDataType { FLOAT = 0, FLOAT2 = 1, FLOAT3 = 2, FLOAT4 = 3, UCHAR4 = 4, NUM = 5 }; /* Attribute * diff --git a/intern/cycles/scene/geometry.cpp b/intern/cycles/scene/geometry.cpp index 8a3fc522d22..bf426fc49f6 100644 --- a/intern/cycles/scene/geometry.cpp +++ b/intern/cycles/scene/geometry.cpp @@ -551,6 +551,7 @@ static void update_attribute_element_size(Geometry *geom, size_t *attr_float_size, size_t *attr_float2_size, size_t *attr_float3_size, + size_t *attr_float4_size, size_t *attr_uchar4_size) { if (mattr) { @@ -569,7 +570,10 @@ static void update_attribute_element_size(Geometry *geom, *attr_float2_size += size; } else if (mattr->type == TypeDesc::TypeMatrix) { - *attr_float3_size += size * 4; + *attr_float4_size += size * 4; + } + else if (mattr->type == TypeFloat4 || mattr->type == TypeRGBA) { + *attr_float4_size += size; } else { *attr_float3_size += size; @@ -582,8 +586,10 @@ void GeometryManager::update_attribute_element_offset(Geometry *geom, size_t &attr_float_offset, device_vector<float2> &attr_float2, size_t &attr_float2_offset, - device_vector<float4> &attr_float3, + device_vector<packed_float3> &attr_float3, size_t &attr_float3_offset, + device_vector<float4> &attr_float4, + size_t &attr_float4_offset, device_vector<uchar4> &attr_uchar4, size_t &attr_uchar4_offset, Attribute *mattr, @@ -646,18 +652,30 @@ void GeometryManager::update_attribute_element_offset(Geometry *geom, } else if (mattr->type == TypeDesc::TypeMatrix) { Transform *tfm = mattr->data_transform(); - offset = attr_float3_offset; + offset = attr_float4_offset; - assert(attr_float3.size() >= offset + size * 3); + assert(attr_float4.size() >= offset + size * 3); if (mattr->modified) { for (size_t k = 0; k < size * 3; k++) { - attr_float3[offset + k] = (&tfm->x)[k]; + attr_float4[offset + k] = (&tfm->x)[k]; } } - attr_float3_offset += size * 3; + attr_float4_offset += size * 3; } - else { + else if (mattr->type == TypeFloat4 || mattr->type == TypeRGBA) { float4 *data = mattr->data_float4(); + offset = attr_float4_offset; + + assert(attr_float4.size() >= offset + size); + if (mattr->modified) { + for (size_t k = 0; k < size; k++) { + attr_float4[offset + k] = data[k]; + } + } + attr_float4_offset += size; + } + else { + float3 *data = mattr->data_float3(); offset = attr_float3_offset; assert(attr_float3.size() >= offset + size); @@ -783,6 +801,7 @@ void GeometryManager::device_update_attributes(Device *device, size_t attr_float_size = 0; size_t attr_float2_size = 0; size_t attr_float3_size = 0; + size_t attr_float4_size = 0; size_t attr_uchar4_size = 0; for (size_t i = 0; i < scene->geometry.size(); i++) { @@ -797,6 +816,7 @@ void GeometryManager::device_update_attributes(Device *device, &attr_float_size, &attr_float2_size, &attr_float3_size, + &attr_float4_size, &attr_uchar4_size); if (geom->is_mesh()) { @@ -809,6 +829,7 @@ void GeometryManager::device_update_attributes(Device *device, &attr_float_size, &attr_float2_size, &attr_float3_size, + &attr_float4_size, &attr_uchar4_size); } } @@ -824,6 +845,7 @@ void GeometryManager::device_update_attributes(Device *device, &attr_float_size, &attr_float2_size, &attr_float3_size, + &attr_float4_size, &attr_uchar4_size); } } @@ -831,19 +853,22 @@ void GeometryManager::device_update_attributes(Device *device, dscene->attributes_float.alloc(attr_float_size); dscene->attributes_float2.alloc(attr_float2_size); dscene->attributes_float3.alloc(attr_float3_size); + dscene->attributes_float4.alloc(attr_float4_size); dscene->attributes_uchar4.alloc(attr_uchar4_size); /* The order of those flags needs to match that of AttrKernelDataType. */ - const bool attributes_need_realloc[4] = { + const bool attributes_need_realloc[AttrKernelDataType::NUM] = { dscene->attributes_float.need_realloc(), dscene->attributes_float2.need_realloc(), dscene->attributes_float3.need_realloc(), + dscene->attributes_float4.need_realloc(), dscene->attributes_uchar4.need_realloc(), }; size_t attr_float_offset = 0; size_t attr_float2_offset = 0; size_t attr_float3_offset = 0; + size_t attr_float4_offset = 0; size_t attr_uchar4_offset = 0; /* Fill in attributes. */ @@ -868,6 +893,8 @@ void GeometryManager::device_update_attributes(Device *device, attr_float2_offset, dscene->attributes_float3, attr_float3_offset, + dscene->attributes_float4, + attr_float4_offset, dscene->attributes_uchar4, attr_uchar4_offset, attr, @@ -891,6 +918,8 @@ void GeometryManager::device_update_attributes(Device *device, attr_float2_offset, dscene->attributes_float3, attr_float3_offset, + dscene->attributes_float4, + attr_float4_offset, dscene->attributes_uchar4, attr_uchar4_offset, subd_attr, @@ -923,6 +952,8 @@ void GeometryManager::device_update_attributes(Device *device, attr_float2_offset, dscene->attributes_float3, attr_float3_offset, + dscene->attributes_float4, + attr_float4_offset, dscene->attributes_uchar4, attr_uchar4_offset, attr, @@ -954,6 +985,7 @@ void GeometryManager::device_update_attributes(Device *device, dscene->attributes_float.copy_to_device_if_modified(); dscene->attributes_float2.copy_to_device_if_modified(); dscene->attributes_float3.copy_to_device_if_modified(); + dscene->attributes_float4.copy_to_device_if_modified(); dscene->attributes_uchar4.copy_to_device_if_modified(); if (progress.get_cancel()) @@ -1080,9 +1112,9 @@ void GeometryManager::device_update_mesh(Device *, /* normals */ progress.set_status("Updating Mesh", "Computing normals"); - float4 *tri_verts = dscene->tri_verts.alloc(tri_size * 3); + packed_float3 *tri_verts = dscene->tri_verts.alloc(tri_size * 3); uint *tri_shader = dscene->tri_shader.alloc(tri_size); - float4 *vnormal = dscene->tri_vnormal.alloc(vert_size); + packed_float3 *vnormal = dscene->tri_vnormal.alloc(vert_size); uint4 *tri_vindex = dscene->tri_vindex.alloc(tri_size); uint *tri_patch = dscene->tri_patch.alloc(tri_size); float2 *tri_patch_uv = dscene->tri_patch_uv.alloc(vert_size); @@ -1293,18 +1325,21 @@ enum { ATTR_FLOAT_MODIFIED = (1 << 2), ATTR_FLOAT2_MODIFIED = (1 << 3), ATTR_FLOAT3_MODIFIED = (1 << 4), - ATTR_UCHAR4_MODIFIED = (1 << 5), + ATTR_FLOAT4_MODIFIED = (1 << 5), + ATTR_UCHAR4_MODIFIED = (1 << 6), - CURVE_DATA_NEED_REALLOC = (1 << 6), - MESH_DATA_NEED_REALLOC = (1 << 7), + CURVE_DATA_NEED_REALLOC = (1 << 7), + MESH_DATA_NEED_REALLOC = (1 << 8), - ATTR_FLOAT_NEEDS_REALLOC = (1 << 8), - ATTR_FLOAT2_NEEDS_REALLOC = (1 << 9), - ATTR_FLOAT3_NEEDS_REALLOC = (1 << 10), - ATTR_UCHAR4_NEEDS_REALLOC = (1 << 11), + ATTR_FLOAT_NEEDS_REALLOC = (1 << 9), + ATTR_FLOAT2_NEEDS_REALLOC = (1 << 10), + ATTR_FLOAT3_NEEDS_REALLOC = (1 << 11), + ATTR_FLOAT4_NEEDS_REALLOC = (1 << 12), + ATTR_UCHAR4_NEEDS_REALLOC = (1 << 13), ATTRS_NEED_REALLOC = (ATTR_FLOAT_NEEDS_REALLOC | ATTR_FLOAT2_NEEDS_REALLOC | - ATTR_FLOAT3_NEEDS_REALLOC | ATTR_UCHAR4_NEEDS_REALLOC), + ATTR_FLOAT3_NEEDS_REALLOC | ATTR_FLOAT4_NEEDS_REALLOC | + ATTR_UCHAR4_NEEDS_REALLOC), DEVICE_MESH_DATA_NEEDS_REALLOC = (MESH_DATA_NEED_REALLOC | ATTRS_NEED_REALLOC), DEVICE_CURVE_DATA_NEEDS_REALLOC = (CURVE_DATA_NEED_REALLOC | ATTRS_NEED_REALLOC), }; @@ -1332,10 +1367,17 @@ static void update_device_flags_attribute(uint32_t &device_update_flags, device_update_flags |= ATTR_FLOAT3_MODIFIED; break; } + case AttrKernelDataType::FLOAT4: { + device_update_flags |= ATTR_FLOAT4_MODIFIED; + break; + } case AttrKernelDataType::UCHAR4: { device_update_flags |= ATTR_UCHAR4_MODIFIED; break; } + case AttrKernelDataType::NUM: { + break; + } } } } @@ -1352,6 +1394,9 @@ static void update_attribute_realloc_flags(uint32_t &device_update_flags, if (attributes.modified(AttrKernelDataType::FLOAT3)) { device_update_flags |= ATTR_FLOAT3_NEEDS_REALLOC; } + if (attributes.modified(AttrKernelDataType::FLOAT4)) { + device_update_flags |= ATTR_FLOAT4_NEEDS_REALLOC; + } if (attributes.modified(AttrKernelDataType::UCHAR4)) { device_update_flags |= ATTR_UCHAR4_NEEDS_REALLOC; } @@ -1553,6 +1598,14 @@ void GeometryManager::device_update_preprocess(Device *device, Scene *scene, Pro dscene->attributes_float3.tag_modified(); } + if (device_update_flags & ATTR_FLOAT4_NEEDS_REALLOC) { + dscene->attributes_map.tag_realloc(); + dscene->attributes_float4.tag_realloc(); + } + else if (device_update_flags & ATTR_FLOAT4_MODIFIED) { + dscene->attributes_float4.tag_modified(); + } + if (device_update_flags & ATTR_UCHAR4_NEEDS_REALLOC) { dscene->attributes_map.tag_realloc(); dscene->attributes_uchar4.tag_realloc(); @@ -2014,6 +2067,7 @@ void GeometryManager::device_update(Device *device, dscene->attributes_float.clear_modified(); dscene->attributes_float2.clear_modified(); dscene->attributes_float3.clear_modified(); + dscene->attributes_float4.clear_modified(); dscene->attributes_uchar4.clear_modified(); } @@ -2041,6 +2095,7 @@ void GeometryManager::device_free(Device *device, DeviceScene *dscene, bool forc dscene->attributes_float.free_if_need_realloc(force_free); dscene->attributes_float2.free_if_need_realloc(force_free); dscene->attributes_float3.free_if_need_realloc(force_free); + dscene->attributes_float4.free_if_need_realloc(force_free); dscene->attributes_uchar4.free_if_need_realloc(force_free); /* Signal for shaders like displacement not to do ray tracing. */ diff --git a/intern/cycles/scene/geometry.h b/intern/cycles/scene/geometry.h index 335bcdcd0b7..91799d7fde8 100644 --- a/intern/cycles/scene/geometry.h +++ b/intern/cycles/scene/geometry.h @@ -257,8 +257,10 @@ class GeometryManager { size_t &attr_float_offset, device_vector<float2> &attr_float2, size_t &attr_float2_offset, - device_vector<float4> &attr_float3, + device_vector<packed_float3> &attr_float3, size_t &attr_float3_offset, + device_vector<float4> &attr_float4, + size_t &attr_float4_offset, device_vector<uchar4> &attr_uchar4, size_t &attr_uchar4_offset, Attribute *mattr, diff --git a/intern/cycles/scene/image.cpp b/intern/cycles/scene/image.cpp index 80091e01b8c..8bb2d87fd1e 100644 --- a/intern/cycles/scene/image.cpp +++ b/intern/cycles/scene/image.cpp @@ -303,7 +303,6 @@ ImageManager::ImageManager(const DeviceInfo &info) animation_frame = 0; /* Set image limits */ - features.has_half_float = info.has_half_images; features.has_nanovdb = info.has_nanovdb; } @@ -357,8 +356,6 @@ void ImageManager::load_image_metadata(Image *img) metadata.detect_colorspace(); - assert(features.has_half_float || - (metadata.type != IMAGE_DATA_TYPE_HALF4 && metadata.type != IMAGE_DATA_TYPE_HALF)); assert(features.has_nanovdb || (metadata.type != IMAGE_DATA_TYPE_NANOVDB_FLOAT || metadata.type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3)); diff --git a/intern/cycles/scene/image.h b/intern/cycles/scene/image.h index 6447b028ebf..7cf09dd6d8f 100644 --- a/intern/cycles/scene/image.h +++ b/intern/cycles/scene/image.h @@ -100,7 +100,6 @@ class ImageMetaData { /* Information about supported features that Image loaders can use. */ class ImageDeviceFeatures { public: - bool has_half_float; bool has_nanovdb; }; diff --git a/intern/cycles/scene/image_oiio.cpp b/intern/cycles/scene/image_oiio.cpp index feafae035a1..4cea7fbfb01 100644 --- a/intern/cycles/scene/image_oiio.cpp +++ b/intern/cycles/scene/image_oiio.cpp @@ -30,7 +30,8 @@ OIIOImageLoader::~OIIOImageLoader() { } -bool OIIOImageLoader::load_metadata(const ImageDeviceFeatures &features, ImageMetaData &metadata) +bool OIIOImageLoader::load_metadata(const ImageDeviceFeatures & /*features*/, + ImageMetaData &metadata) { /* Perform preliminary checks, with meaningful logging. */ if (!path_exists(filepath.string())) { @@ -76,7 +77,7 @@ bool OIIOImageLoader::load_metadata(const ImageDeviceFeatures &features, ImageMe } /* check if it's half float */ - if (spec.format == TypeDesc::HALF && features.has_half_float) { + if (spec.format == TypeDesc::HALF) { is_half = true; } diff --git a/intern/cycles/scene/integrator.cpp b/intern/cycles/scene/integrator.cpp index a97833a8d32..31e645c1f3a 100644 --- a/intern/cycles/scene/integrator.cpp +++ b/intern/cycles/scene/integrator.cpp @@ -54,6 +54,18 @@ NODE_DEFINE(Integrator) SOCKET_INT(transparent_min_bounce, "Transparent Min Bounce", 0); SOCKET_INT(transparent_max_bounce, "Transparent Max Bounce", 7); +#ifdef WITH_CYCLES_DEBUG + static NodeEnum direct_light_sampling_type_enum; + direct_light_sampling_type_enum.insert("multiple_importance_sampling", + DIRECT_LIGHT_SAMPLING_MIS); + direct_light_sampling_type_enum.insert("forward_path_tracing", DIRECT_LIGHT_SAMPLING_FORWARD); + direct_light_sampling_type_enum.insert("next_event_estimation", DIRECT_LIGHT_SAMPLING_NEE); + SOCKET_ENUM(direct_light_sampling_type, + "Direct Light Sampling Type", + direct_light_sampling_type_enum, + DIRECT_LIGHT_SAMPLING_MIS); +#endif + SOCKET_INT(ao_bounces, "AO Bounces", 0); SOCKET_FLOAT(ao_factor, "AO Factor", 0.0f); SOCKET_FLOAT(ao_distance, "AO Distance", FLT_MAX); @@ -173,6 +185,12 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene kintegrator->ao_bounces_factor = ao_factor; kintegrator->ao_additive_factor = ao_additive_factor; +#ifdef WITH_CYCLES_DEBUG + kintegrator->direct_light_sampling_type = direct_light_sampling_type; +#else + kintegrator->direct_light_sampling_type = DIRECT_LIGHT_SAMPLING_MIS; +#endif + /* Transparent Shadows * We only need to enable transparent shadows, if we actually have * transparent shaders in the scene. Otherwise we can disable it diff --git a/intern/cycles/scene/integrator.h b/intern/cycles/scene/integrator.h index 464d96ca01b..52f1b296a20 100644 --- a/intern/cycles/scene/integrator.h +++ b/intern/cycles/scene/integrator.h @@ -41,6 +41,10 @@ class Integrator : public Node { NODE_SOCKET_API(int, max_transmission_bounce) NODE_SOCKET_API(int, max_volume_bounce) +#ifdef WITH_CYCLES_DEBUG + NODE_SOCKET_API(DirectLightSamplingType, direct_light_sampling_type) +#endif + NODE_SOCKET_API(int, transparent_min_bounce) NODE_SOCKET_API(int, transparent_max_bounce) diff --git a/intern/cycles/scene/mesh.cpp b/intern/cycles/scene/mesh.cpp index f47dab30869..e65b8462e34 100644 --- a/intern/cycles/scene/mesh.cpp +++ b/intern/cycles/scene/mesh.cpp @@ -707,7 +707,7 @@ void Mesh::pack_shaders(Scene *scene, uint *tri_shader) } } -void Mesh::pack_normals(float4 *vnormal) +void Mesh::pack_normals(packed_float3 *vnormal) { Attribute *attr_vN = attributes.find(ATTR_STD_VERTEX_NORMAL); if (attr_vN == NULL) { @@ -727,11 +727,14 @@ void Mesh::pack_normals(float4 *vnormal) if (do_transform) vNi = safe_normalize(transform_direction(&ntfm, vNi)); - vnormal[i] = make_float4(vNi.x, vNi.y, vNi.z, 0.0f); + vnormal[i] = make_float3(vNi.x, vNi.y, vNi.z); } } -void Mesh::pack_verts(float4 *tri_verts, uint4 *tri_vindex, uint *tri_patch, float2 *tri_patch_uv) +void Mesh::pack_verts(packed_float3 *tri_verts, + uint4 *tri_vindex, + uint *tri_patch, + float2 *tri_patch_uv) { size_t verts_size = verts.size(); @@ -752,9 +755,9 @@ void Mesh::pack_verts(float4 *tri_verts, uint4 *tri_vindex, uint *tri_patch, flo tri_patch[i] = (!get_num_subd_faces()) ? -1 : (triangle_patch[i] * 8 + patch_offset); - tri_verts[i * 3] = float3_to_float4(verts[t.v[0]]); - tri_verts[i * 3 + 1] = float3_to_float4(verts[t.v[1]]); - tri_verts[i * 3 + 2] = float3_to_float4(verts[t.v[2]]); + tri_verts[i * 3] = verts[t.v[0]]; + tri_verts[i * 3 + 1] = verts[t.v[1]]; + tri_verts[i * 3 + 2] = verts[t.v[2]]; } } diff --git a/intern/cycles/scene/mesh.h b/intern/cycles/scene/mesh.h index d13b3003164..254672d0620 100644 --- a/intern/cycles/scene/mesh.h +++ b/intern/cycles/scene/mesh.h @@ -223,8 +223,11 @@ class Mesh : public Geometry { void get_uv_tiles(ustring map, unordered_set<int> &tiles) override; void pack_shaders(Scene *scene, uint *shader); - void pack_normals(float4 *vnormal); - void pack_verts(float4 *tri_verts, uint4 *tri_vindex, uint *tri_patch, float2 *tri_patch_uv); + void pack_normals(packed_float3 *vnormal); + void pack_verts(packed_float3 *tri_verts, + uint4 *tri_vindex, + uint *tri_patch, + float2 *tri_patch_uv); void pack_patches(uint *patch_data); PrimitiveType primitive_type() const override; diff --git a/intern/cycles/scene/mesh_subdivision.cpp b/intern/cycles/scene/mesh_subdivision.cpp index a0c0bc68f8b..35f15cfafbc 100644 --- a/intern/cycles/scene/mesh_subdivision.cpp +++ b/intern/cycles/scene/mesh_subdivision.cpp @@ -331,7 +331,8 @@ struct OsdPatch : Patch { void eval(float3 *P, float3 *dPdu, float3 *dPdv, float3 *N, float u, float v) { - const Far::PatchTable::PatchHandle *handle = osd_data->patch_map->FindPatch(patch_index, u, v); + const Far::PatchTable::PatchHandle *handle = osd_data->patch_map->FindPatch( + patch_index, (double)u, (double)v); assert(handle); float p_weights[20], du_weights[20], dv_weights[20]; diff --git a/intern/cycles/scene/scene.cpp b/intern/cycles/scene/scene.cpp index ef0ee0c6625..4230abe9a1b 100644 --- a/intern/cycles/scene/scene.cpp +++ b/intern/cycles/scene/scene.cpp @@ -74,6 +74,7 @@ DeviceScene::DeviceScene(Device *device) attributes_float(device, "__attributes_float", MEM_GLOBAL), attributes_float2(device, "__attributes_float2", MEM_GLOBAL), attributes_float3(device, "__attributes_float3", MEM_GLOBAL), + attributes_float4(device, "__attributes_float4", MEM_GLOBAL), attributes_uchar4(device, "__attributes_uchar4", MEM_GLOBAL), light_distribution(device, "__light_distribution", MEM_GLOBAL), lights(device, "__lights", MEM_GLOBAL), diff --git a/intern/cycles/scene/scene.h b/intern/cycles/scene/scene.h index fa7fc54602a..4af05349dd3 100644 --- a/intern/cycles/scene/scene.h +++ b/intern/cycles/scene/scene.h @@ -81,9 +81,9 @@ class DeviceScene { device_vector<float2> prim_time; /* mesh */ - device_vector<float4> tri_verts; + device_vector<packed_float3> tri_verts; device_vector<uint> tri_shader; - device_vector<float4> tri_vnormal; + device_vector<packed_float3> tri_vnormal; device_vector<uint4> tri_vindex; device_vector<uint> tri_patch; device_vector<float2> tri_patch_uv; @@ -108,7 +108,8 @@ class DeviceScene { device_vector<uint4> attributes_map; device_vector<float> attributes_float; device_vector<float2> attributes_float2; - device_vector<float4> attributes_float3; + device_vector<packed_float3> attributes_float3; + device_vector<float4> attributes_float4; device_vector<uchar4> attributes_uchar4; /* lights */ diff --git a/intern/cycles/scene/shader_nodes.cpp b/intern/cycles/scene/shader_nodes.cpp index 8a9ef56b0ae..8c20807a52b 100644 --- a/intern/cycles/scene/shader_nodes.cpp +++ b/intern/cycles/scene/shader_nodes.cpp @@ -34,6 +34,8 @@ #include "util/log.h" #include "util/transform.h" +#include "kernel/tables.h" + #include "kernel/svm/color_util.h" #include "kernel/svm/mapping_util.h" #include "kernel/svm/math_util.h" diff --git a/intern/cycles/session/session.cpp b/intern/cycles/session/session.cpp index 530baa8cafb..af5c6b3f1fd 100644 --- a/intern/cycles/session/session.cpp +++ b/intern/cycles/session/session.cpp @@ -262,6 +262,7 @@ RenderWork Session::run_update_for_next_iteration() } render_scheduler_.set_num_samples(params.samples); + render_scheduler_.set_start_sample(params.sample_offset); render_scheduler_.set_time_limit(params.time_limit); while (have_tiles) { @@ -409,7 +410,7 @@ void Session::do_delayed_reset() /* Tile and work scheduling. */ tile_manager_.reset_scheduling(buffer_params_, get_effective_tile_size()); - render_scheduler_.reset(buffer_params_, params.samples); + render_scheduler_.reset(buffer_params_, params.samples, params.sample_offset); /* Passes. */ /* When multiple tiles are used SAMPLE_COUNT pass is used to keep track of possible partial diff --git a/intern/cycles/session/session.h b/intern/cycles/session/session.h index 1ec0c6e9bb1..3f73593f008 100644 --- a/intern/cycles/session/session.h +++ b/intern/cycles/session/session.h @@ -54,6 +54,7 @@ class SessionParams { bool experimental; int samples; + int sample_offset; int pixel_size; int threads; @@ -75,6 +76,7 @@ class SessionParams { experimental = false; samples = 1024; + sample_offset = 0; pixel_size = 1; threads = 0; time_limit = 0.0; diff --git a/intern/cycles/util/atomic.h b/intern/cycles/util/atomic.h index faba411c769..afc3fd019df 100644 --- a/intern/cycles/util/atomic.h +++ b/intern/cycles/util/atomic.h @@ -63,6 +63,62 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile float *dest, # endif /* __KERNEL_CUDA__ */ +# ifdef __KERNEL_METAL__ + +// global address space versions +ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *_source, + const float operand) +{ + volatile ccl_global atomic_int *source = (ccl_global atomic_int *)_source; + union { + int int_value; + float float_value; + } new_value, prev_value; + prev_value.int_value = atomic_load_explicit(source, memory_order_relaxed); + do { + new_value.float_value = prev_value.float_value + operand; + } while (!atomic_compare_exchange_weak_explicit(source, + &prev_value.int_value, + new_value.int_value, + memory_order_relaxed, + memory_order_relaxed)); + + return new_value.float_value; +} + +# define atomic_fetch_and_add_uint32(p, x) \ + atomic_fetch_add_explicit((device atomic_uint *)p, x, memory_order_relaxed) +# define atomic_fetch_and_sub_uint32(p, x) \ + atomic_fetch_sub_explicit((device atomic_uint *)p, x, memory_order_relaxed) +# define atomic_fetch_and_inc_uint32(p) \ + atomic_fetch_add_explicit((device atomic_uint *)p, 1, memory_order_relaxed) +# define atomic_fetch_and_dec_uint32(p) \ + atomic_fetch_sub_explicit((device atomic_uint *)p, 1, memory_order_relaxed) +# define atomic_fetch_and_or_uint32(p, x) \ + atomic_fetch_or_explicit((device atomic_uint *)p, x, memory_order_relaxed) + +ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float *dest, + const float old_val, + const float new_val) +{ + int prev_value; + prev_value = __float_as_int(old_val); + atomic_compare_exchange_weak_explicit((ccl_global atomic_int *)dest, + &prev_value, + __float_as_int(new_val), + memory_order_relaxed, + memory_order_relaxed); + return __int_as_float(prev_value); +} + +# define atomic_store(p, x) atomic_store_explicit(p, x, memory_order_relaxed) +# define atomic_fetch(p) atomic_load_explicit(p, memory_order_relaxed) + +# define CCL_LOCAL_MEM_FENCE mem_flags::mem_threadgroup +# define ccl_barrier(flags) threadgroup_barrier(flags) + +# endif /* __KERNEL_METAL__ */ + #endif /* __KERNEL_GPU__ */ #endif /* __UTIL_ATOMIC_H__ */ diff --git a/intern/cycles/util/debug.cpp b/intern/cycles/util/debug.cpp index 7d5b6d4e54e..717e55a2c9a 100644 --- a/intern/cycles/util/debug.cpp +++ b/intern/cycles/util/debug.cpp @@ -64,6 +64,11 @@ DebugFlags::HIP::HIP() : adaptive_compile(false) reset(); } +DebugFlags::Metal::Metal() : adaptive_compile(false) +{ + reset(); +} + void DebugFlags::CUDA::reset() { if (getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL) @@ -76,6 +81,12 @@ void DebugFlags::HIP::reset() adaptive_compile = true; } +void DebugFlags::Metal::reset() +{ + if (getenv("CYCLES_METAL_ADAPTIVE_COMPILE") != NULL) + adaptive_compile = true; +} + DebugFlags::OptiX::OptiX() { reset(); @@ -97,6 +108,7 @@ void DebugFlags::reset() cpu.reset(); cuda.reset(); optix.reset(); + metal.reset(); } CCL_NAMESPACE_END diff --git a/intern/cycles/util/debug.h b/intern/cycles/util/debug.h index 548c67600e5..1e431fde68a 100644 --- a/intern/cycles/util/debug.h +++ b/intern/cycles/util/debug.h @@ -116,6 +116,17 @@ class DebugFlags { bool use_debug; }; + /* Descriptor of Metal feature-set to be used. */ + struct Metal { + Metal(); + + /* Reset flags to their defaults. */ + void reset(); + + /* Whether adaptive feature based runtime compile is enabled or not.*/ + bool adaptive_compile; + }; + /* Get instance of debug flags registry. */ static DebugFlags &get() { @@ -138,6 +149,9 @@ class DebugFlags { /* Requested HIP flags. */ HIP hip; + /* Requested Metal flags. */ + Metal metal; + private: DebugFlags(); diff --git a/intern/cycles/util/defines.h b/intern/cycles/util/defines.h index a778bef52b2..a2e8d83adb7 100644 --- a/intern/cycles/util/defines.h +++ b/intern/cycles/util/defines.h @@ -44,6 +44,7 @@ # if defined(_WIN32) && !defined(FREE_WINDOWS) # define ccl_device_inline static __forceinline # define ccl_device_forceinline static __forceinline +# define ccl_device_inline_method __forceinline # define ccl_align(...) __declspec(align(__VA_ARGS__)) # ifdef __KERNEL_64_BIT__ # define ccl_try_align(...) __declspec(align(__VA_ARGS__)) @@ -58,6 +59,7 @@ # else /* _WIN32 && !FREE_WINDOWS */ # define ccl_device_inline static inline __attribute__((always_inline)) # define ccl_device_forceinline static inline __attribute__((always_inline)) +# define ccl_device_inline_method __attribute__((always_inline)) # define ccl_align(...) __attribute__((aligned(__VA_ARGS__))) # ifndef FREE_WINDOWS64 # define __forceinline inline __attribute__((always_inline)) @@ -70,7 +72,7 @@ /* Address spaces for GPU. */ # define ccl_global -# define ccl_static_constant static const +# define ccl_inline_constant inline constexpr # define ccl_constant const # define ccl_private diff --git a/intern/cycles/util/half.h b/intern/cycles/util/half.h index 016975e3c25..555f17259bd 100644 --- a/intern/cycles/util/half.h +++ b/intern/cycles/util/half.h @@ -28,8 +28,27 @@ CCL_NAMESPACE_BEGIN /* Half Floats */ +#if defined(__KERNEL_METAL__) + +ccl_device_inline float half_to_float(half h_in) +{ + float f; + union { + half h; + uint16_t s; + } val; + val.h = h_in; + + *((ccl_private int *)&f) = ((val.s & 0x8000) << 16) | (((val.s & 0x7c00) + 0x1C000) << 13) | + ((val.s & 0x03FF) << 13); + + return f; +} + +#else + /* CUDA has its own half data type, no need to define then */ -#if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__) +# if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__) /* Implementing this as a class rather than a typedef so that the compiler can tell it apart from * unsigned shorts. */ class half { @@ -53,11 +72,12 @@ class half { private: unsigned short v; }; -#endif +# endif struct half4 { half x, y, z, w; }; +#endif /* Conversion to/from half float for image textures * @@ -66,7 +86,9 @@ struct half4 { ccl_device_inline half float_to_half_image(float f) { -#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) +#if defined(__KERNEL_METAL__) + return half(f); +#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) return __float2half(f); #else const uint u = __float_as_uint(f); @@ -92,7 +114,9 @@ ccl_device_inline half float_to_half_image(float f) ccl_device_inline float half_to_float_image(half h) { -#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) +#if defined(__KERNEL_METAL__) + return half_to_float(h); +#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) return __half2float(h); #else const int x = ((h & 0x8000) << 16) | (((h & 0x7c00) + 0x1C000) << 13) | ((h & 0x03FF) << 13); @@ -125,7 +149,9 @@ ccl_device_inline float4 half4_to_float4_image(const half4 h) ccl_device_inline half float_to_half_display(const float f) { -#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) +#if defined(__KERNEL_METAL__) + return half(f); +#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) return __float2half(f); #else const int x = __float_as_int((f > 0.0f) ? ((f < 65504.0f) ? f : 65504.0f) : 0.0f); diff --git a/intern/cycles/util/ies.cpp b/intern/cycles/util/ies.cpp index 5e879478df5..e924d660407 100644 --- a/intern/cycles/util/ies.cpp +++ b/intern/cycles/util/ies.cpp @@ -23,10 +23,10 @@ CCL_NAMESPACE_BEGIN -// NOTE: For some reason gcc-7.2 does not instantiate this versio of allocator -// gere (used in IESTextParser). Works fine for gcc-6, gcc-7.3 and gcc-8. +// NOTE: For some reason gcc-7.2 does not instantiate this version of the +// allocator here (used in IESTextParser). Works fine for gcc-6, gcc-7.3 and gcc-8. // -// TODO(sergey): Get to the root of this issue, or confirm this i a compiler +// TODO(sergey): Get to the root of this issue, or confirm this is a compiler // issue. template class GuardedAllocator<char>; diff --git a/intern/cycles/util/math.h b/intern/cycles/util/math.h index e4c7df6e44a..6cfeb1aa917 100644 --- a/intern/cycles/util/math.h +++ b/intern/cycles/util/math.h @@ -30,9 +30,11 @@ # include <hip/hip_vector_types.h> #endif -#include <float.h> -#include <math.h> -#include <stdio.h> +#if !defined(__KERNEL_METAL__) +# include <float.h> +# include <math.h> +# include <stdio.h> +#endif /* !defined(__KERNEL_METAL__) */ #include "util/types.h" @@ -174,6 +176,7 @@ ccl_device_inline float max4(float a, float b, float c, float d) return max(max(a, b), max(c, d)); } +#if !defined(__KERNEL_METAL__) /* Int/Float conversion */ ccl_device_inline int as_int(uint i) @@ -206,7 +209,7 @@ ccl_device_inline uint as_uint(float f) return u.i; } -#ifndef __HIP__ +# ifndef __HIP__ ccl_device_inline int __float_as_int(float f) { union { @@ -246,28 +249,33 @@ ccl_device_inline float __uint_as_float(uint i) u.i = i; return u.f; } -#endif +# endif ccl_device_inline int4 __float4_as_int4(float4 f) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return int4(_mm_castps_si128(f.m128)); -#else +# else return make_int4( __float_as_int(f.x), __float_as_int(f.y), __float_as_int(f.z), __float_as_int(f.w)); -#endif +# endif } ccl_device_inline float4 __int4_as_float4(int4 i) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float4(_mm_castsi128_ps(i.m128)); -#else +# else return make_float4( __int_as_float(i.x), __int_as_float(i.y), __int_as_float(i.z), __int_as_float(i.w)); -#endif +# endif } +#endif /* !defined(__KERNEL_METAL__) */ +#if defined(__KERNEL_METAL__) +# define isnan_safe(v) isnan(v) +# define isfinite_safe(v) isfinite(v) +#else template<typename T> ccl_device_inline uint pointer_pack_to_uint_0(T *ptr) { return ((uint64_t)ptr) & 0xFFFFFFFF; @@ -311,12 +319,14 @@ ccl_device_inline bool isfinite_safe(float f) unsigned int x = __float_as_uint(f); return (f == f) && (x == 0 || x == (1u << 31) || (f != 2.0f * f)) && !((x << 1) > 0xff000000u); } +#endif ccl_device_inline float ensure_finite(float v) { return isfinite_safe(v) ? v : 0.0f; } +#if !defined(__KERNEL_METAL__) ccl_device_inline int clamp(int a, int mn, int mx) { return min(max(a, mn), mx); @@ -346,15 +356,17 @@ ccl_device_inline float smoothstep(float edge0, float edge1, float x) return result; } -#ifndef __KERNEL_CUDA__ +#endif /* !defined(__KERNEL_METAL__) */ + +#if defined(__KERNEL_CUDA__) ccl_device_inline float saturatef(float a) { - return clamp(a, 0.0f, 1.0f); + return __saturatef(a); } -#else +#elif !defined(__KERNEL_METAL__) ccl_device_inline float saturatef(float a) { - return __saturatef(a); + return clamp(a, 0.0f, 1.0f); } #endif /* __KERNEL_CUDA__ */ @@ -491,6 +503,7 @@ CCL_NAMESPACE_END CCL_NAMESPACE_BEGIN +#if !defined(__KERNEL_METAL__) /* Interpolation */ template<class A, class B> A lerp(const A &a, const A &b, const B &t) @@ -498,6 +511,8 @@ template<class A, class B> A lerp(const A &a, const A &b, const B &t) return (A)(a * ((B)1 - t) + b * t); } +#endif /* __KERNEL_METAL__ */ + /* Triangle */ ccl_device_inline float triangle_area(ccl_private const float3 &v1, @@ -627,7 +642,11 @@ ccl_device_inline float safe_sqrtf(float f) ccl_device_inline float inversesqrtf(float f) { +#if defined(__KERNEL_METAL__) + return (f > 0.0f) ? rsqrt(f) : 0.0f; +#else return (f > 0.0f) ? 1.0f / sqrtf(f) : 0.0f; +#endif } ccl_device float safe_asinf(float a) @@ -715,10 +734,30 @@ ccl_device float bits_to_01(uint bits) return bits * (1.0f / (float)0xFFFFFFFF); } +#if !defined(__KERNEL_GPU__) +# if defined(__GNUC__) +# define popcount(x) __builtin_popcount(x) +# else +ccl_device_inline uint popcount(uint x) +{ + /* TODO(Stefan): pop-count intrinsic for Windows with fallback for older CPUs. */ + uint i = x & 0xaaaaaaaa; + i = i - ((i >> 1) & 0x55555555); + i = (i & 0x33333333) + ((i >> 2) & 0x33333333); + i = (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24; + return i & 1; +} +# endif +#elif !defined(__KERNEL_METAL__) +# define popcount(x) __popc(x) +#endif + ccl_device_inline uint count_leading_zeros(uint x) { #if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__) return __clz(x); +#elif defined(__KERNEL_METAL__) + return clz(x); #else assert(x != 0); # ifdef _MSC_VER @@ -735,6 +774,8 @@ ccl_device_inline uint count_trailing_zeros(uint x) { #if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__) return (__ffs(x) - 1); +#elif defined(__KERNEL_METAL__) + return ctz(x); #else assert(x != 0); # ifdef _MSC_VER @@ -751,6 +792,8 @@ ccl_device_inline uint find_first_set(uint x) { #if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__) return __ffs(x); +#elif defined(__KERNEL_METAL__) + return (x != 0) ? ctz(x) + 1 : 0; #else # ifdef _MSC_VER return (x != 0) ? (32 - count_leading_zeros(x & (-x))) : 0; @@ -801,7 +844,7 @@ ccl_device_inline float2 map_to_sphere(const float3 co) * https://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/ */ -ccl_device_inline float compare_floats(float a, float b, float abs_diff, int ulp_diff) +ccl_device_inline bool compare_floats(float a, float b, float abs_diff, int ulp_diff) { if (fabsf(a - b) < abs_diff) { return true; @@ -849,6 +892,8 @@ ccl_device_inline uint32_t reverse_integer_bits(uint32_t x) return x; #elif defined(__KERNEL_CUDA__) return __brev(x); +#elif defined(__KERNEL_METAL__) + return reverse_bits(x); #elif __has_builtin(__builtin_bitreverse32) return __builtin_bitreverse32(x); #else diff --git a/intern/cycles/util/math_float2.h b/intern/cycles/util/math_float2.h index 87141d5bc37..8ff75c6c20a 100644 --- a/intern/cycles/util/math_float2.h +++ b/intern/cycles/util/math_float2.h @@ -27,6 +27,7 @@ CCL_NAMESPACE_BEGIN * Declaration. */ +#if !defined(__KERNEL_METAL__) ccl_device_inline float2 operator-(const float2 &a); ccl_device_inline float2 operator*(const float2 &a, const float2 &b); ccl_device_inline float2 operator*(const float2 &a, float f); @@ -63,6 +64,7 @@ ccl_device_inline float2 fabs(const float2 &a); ccl_device_inline float2 as_float2(const float4 &a); ccl_device_inline float2 interp(const float2 &a, const float2 &b, float t); ccl_device_inline float2 floor(const float2 &a); +#endif /* !__KERNEL_METAL__ */ ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b); @@ -80,6 +82,7 @@ ccl_device_inline float2 one_float2() return make_float2(1.0f, 1.0f); } +#if !defined(__KERNEL_METAL__) ccl_device_inline float2 operator-(const float2 &a) { return make_float2(-a.x, -a.y); @@ -259,6 +262,8 @@ ccl_device_inline float2 floor(const float2 &a) return make_float2(floorf(a.x), floorf(a.y)); } +#endif /* !__KERNEL_METAL__ */ + ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b) { return (b != 0.0f) ? a / b : zero_float2(); diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h index 81550c5d03c..1a0213f2a6d 100644 --- a/intern/cycles/util/math_float3.h +++ b/intern/cycles/util/math_float3.h @@ -27,6 +27,7 @@ CCL_NAMESPACE_BEGIN * Declaration. */ +#if !defined(__KERNEL_METAL__) ccl_device_inline float3 operator-(const float3 &a); ccl_device_inline float3 operator*(const float3 &a, const float3 &b); ccl_device_inline float3 operator*(const float3 &a, const float f); @@ -62,19 +63,20 @@ ccl_device_inline float3 rcp(const float3 &a); ccl_device_inline float3 sqrt(const float3 &a); ccl_device_inline float3 floor(const float3 &a); ccl_device_inline float3 ceil(const float3 &a); +ccl_device_inline float3 reflect(const float3 incident, const float3 normal); +#endif /* !defined(__KERNEL_METAL__) */ ccl_device_inline float min3(float3 a); ccl_device_inline float max3(float3 a); ccl_device_inline float len(const float3 a); ccl_device_inline float len_squared(const float3 a); -ccl_device_inline float3 reflect(const float3 incident, const float3 normal); ccl_device_inline float3 project(const float3 v, const float3 v_proj); ccl_device_inline float3 saturate3(float3 a); ccl_device_inline float3 safe_normalize(const float3 a); -ccl_device_inline float3 normalize_len(const float3 a, float *t); -ccl_device_inline float3 safe_normalize_len(const float3 a, float *t); +ccl_device_inline float3 normalize_len(const float3 a, ccl_private float *t); +ccl_device_inline float3 safe_normalize_len(const float3 a, ccl_private float *t); ccl_device_inline float3 safe_divide_float3_float3(const float3 a, const float3 b); ccl_device_inline float3 safe_divide_float3_float(const float3 a, const float b); ccl_device_inline float3 interp(float3 a, float3 b, float t); @@ -103,49 +105,58 @@ ccl_device_inline float3 one_float3() return make_float3(1.0f, 1.0f, 1.0f); } +#if defined(__KERNEL_METAL__) + +ccl_device_inline float3 rcp(float3 a) +{ + return make_float3(1.0f / a.x, 1.0f / a.y, 1.0f / a.z); +} + +#else + ccl_device_inline float3 operator-(const float3 &a) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)))); -#else +# else return make_float3(-a.x, -a.y, -a.z); -#endif +# endif } ccl_device_inline float3 operator*(const float3 &a, const float3 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float3(_mm_mul_ps(a.m128, b.m128)); -#else +# else return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); -#endif +# endif } ccl_device_inline float3 operator*(const float3 &a, const float f) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f))); -#else +# else return make_float3(a.x * f, a.y * f, a.z * f); -#endif +# endif } ccl_device_inline float3 operator*(const float f, const float3 &a) { -#if defined(__KERNEL_SSE__) +# if defined(__KERNEL_SSE__) return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128)); -#else +# else return make_float3(a.x * f, a.y * f, a.z * f); -#endif +# endif } ccl_device_inline float3 operator/(const float f, const float3 &a) { -#if defined(__KERNEL_SSE__) +# if defined(__KERNEL_SSE__) return float3(_mm_div_ps(_mm_set1_ps(f), a.m128)); -#else +# else return make_float3(f / a.x, f / a.y, f / a.z); -#endif +# endif } ccl_device_inline float3 operator/(const float3 &a, const float f) @@ -156,11 +167,11 @@ ccl_device_inline float3 operator/(const float3 &a, const float f) ccl_device_inline float3 operator/(const float3 &a, const float3 &b) { -#if defined(__KERNEL_SSE__) +# if defined(__KERNEL_SSE__) return float3(_mm_div_ps(a.m128, b.m128)); -#else +# else return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); -#endif +# endif } ccl_device_inline float3 operator+(const float3 &a, const float f) @@ -170,11 +181,11 @@ ccl_device_inline float3 operator+(const float3 &a, const float f) ccl_device_inline float3 operator+(const float3 &a, const float3 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float3(_mm_add_ps(a.m128, b.m128)); -#else +# else return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); -#endif +# endif } ccl_device_inline float3 operator-(const float3 &a, const float f) @@ -184,11 +195,11 @@ ccl_device_inline float3 operator-(const float3 &a, const float f) ccl_device_inline float3 operator-(const float3 &a, const float3 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float3(_mm_sub_ps(a.m128, b.m128)); -#else +# else return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); -#endif +# endif } ccl_device_inline float3 operator+=(float3 &a, const float3 &b) @@ -222,13 +233,39 @@ ccl_device_inline float3 operator/=(float3 &a, float f) return a = a * invf; } +#if !(defined(__KERNEL_METAL__) || defined(__KERNEL_CUDA__)) +ccl_device_inline packed_float3 operator*=(packed_float3 &a, const float3 &b) +{ + a = float3(a) * b; + return a; +} + +ccl_device_inline packed_float3 operator*=(packed_float3 &a, float f) +{ + a = float3(a) * f; + return a; +} + +ccl_device_inline packed_float3 operator/=(packed_float3 &a, const float3 &b) +{ + a = float3(a) / b; + return a; +} + +ccl_device_inline packed_float3 operator/=(packed_float3 &a, float f) +{ + a = float3(a) / f; + return a; +} +#endif + ccl_device_inline bool operator==(const float3 &a, const float3 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7; -#else +# else return (a.x == b.x && a.y == b.y && a.z == b.z); -#endif +# endif } ccl_device_inline bool operator!=(const float3 &a, const float3 &b) @@ -243,20 +280,20 @@ ccl_device_inline float distance(const float3 &a, const float3 &b) ccl_device_inline float dot(const float3 &a, const float3 &b) { -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) +# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F)); -#else +# else return a.x * b.x + a.y * b.y + a.z * b.z; -#endif +# endif } ccl_device_inline float dot_xy(const float3 &a, const float3 &b) { -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) +# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a, b), b)); -#else +# else return a.x * b.x + a.y * b.y; -#endif +# endif } ccl_device_inline float3 cross(const float3 &a, const float3 &b) @@ -267,30 +304,30 @@ ccl_device_inline float3 cross(const float3 &a, const float3 &b) ccl_device_inline float3 normalize(const float3 &a) { -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) +# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) __m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F)); return float3(_mm_div_ps(a.m128, norm)); -#else +# else return a / len(a); -#endif +# endif } ccl_device_inline float3 min(const float3 &a, const float3 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float3(_mm_min_ps(a.m128, b.m128)); -#else +# else return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); -#endif +# endif } ccl_device_inline float3 max(const float3 &a, const float3 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float3(_mm_max_ps(a.m128, b.m128)); -#else +# else return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); -#endif +# endif } ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &mx) @@ -300,43 +337,43 @@ ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 & ccl_device_inline float3 fabs(const float3 &a) { -#ifdef __KERNEL_SSE__ -# ifdef __KERNEL_NEON__ +# ifdef __KERNEL_SSE__ +# ifdef __KERNEL_NEON__ return float3(vabsq_f32(a.m128)); -# else +# else __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); return float3(_mm_and_ps(a.m128, mask)); -# endif -#else +# endif +# else return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z)); -#endif +# endif } ccl_device_inline float3 sqrt(const float3 &a) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float3(_mm_sqrt_ps(a)); -#else +# else return make_float3(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z)); -#endif +# endif } ccl_device_inline float3 floor(const float3 &a) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float3(_mm_floor_ps(a)); -#else +# else return make_float3(floorf(a.x), floorf(a.y), floorf(a.z)); -#endif +# endif } ccl_device_inline float3 ceil(const float3 &a) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float3(_mm_ceil_ps(a)); -#else +# else return make_float3(ceilf(a.x), ceilf(a.y), ceilf(a.z)); -#endif +# endif } ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t) @@ -346,13 +383,14 @@ ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t) ccl_device_inline float3 rcp(const float3 &a) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ /* Don't use _mm_rcp_ps due to poor precision. */ return float3(_mm_div_ps(_mm_set_ps1(1.0f), a.m128)); -#else +# else return make_float3(1.0f / a.x, 1.0f / a.y, 1.0f / a.z); -#endif +# endif } +#endif /* !__KERNEL_METAL__ */ ccl_device_inline float min3(float3 a) { @@ -378,6 +416,7 @@ ccl_device_inline float len_squared(const float3 a) return dot(a, a); } +#if !defined(__KERNEL_METAL__) ccl_device_inline float3 reflect(const float3 incident, const float3 normal) { float3 unit_normal = normalize(normal); @@ -399,6 +438,7 @@ ccl_device_inline float3 faceforward(const float3 vector, { return (dot(reference, incident) < 0.0f) ? vector : -vector; } +#endif ccl_device_inline float3 project(const float3 v, const float3 v_proj) { @@ -479,7 +519,11 @@ ccl_device_inline float average(const float3 a) ccl_device_inline bool isequal_float3(const float3 a, const float3 b) { +#if defined(__KERNEL_METAL__) + return all(a == b); +#else return a == b; +#endif } ccl_device_inline float3 pow3(float3 v, float e) diff --git a/intern/cycles/util/math_float4.h b/intern/cycles/util/math_float4.h index c76959ee7ff..1203a10cca4 100644 --- a/intern/cycles/util/math_float4.h +++ b/intern/cycles/util/math_float4.h @@ -27,6 +27,7 @@ CCL_NAMESPACE_BEGIN * Declaration. */ +#if !defined(__KERNEL_METAL__) ccl_device_inline float4 operator-(const float4 &a); ccl_device_inline float4 operator*(const float4 &a, const float4 &b); ccl_device_inline float4 operator*(const float4 &a, float f); @@ -65,6 +66,7 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 & ccl_device_inline float4 fabs(const float4 &a); ccl_device_inline float4 floor(const float4 &a); ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t); +#endif /* !__KERNEL_METAL__*/ ccl_device_inline float4 safe_divide_float4_float(const float4 a, const float b); @@ -110,32 +112,33 @@ ccl_device_inline float4 one_float4() return make_float4(1.0f, 1.0f, 1.0f, 1.0f); } +#if !defined(__KERNEL_METAL__) ccl_device_inline float4 operator-(const float4 &a) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); return float4(_mm_xor_ps(a.m128, mask)); -#else +# else return make_float4(-a.x, -a.y, -a.z, -a.w); -#endif +# endif } ccl_device_inline float4 operator*(const float4 &a, const float4 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float4(_mm_mul_ps(a.m128, b.m128)); -#else +# else return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); -#endif +# endif } ccl_device_inline float4 operator*(const float4 &a, float f) { -#if defined(__KERNEL_SSE__) +# if defined(__KERNEL_SSE__) return a * make_float4(f); -#else +# else return make_float4(a.x * f, a.y * f, a.z * f, a.w * f); -#endif +# endif } ccl_device_inline float4 operator*(float f, const float4 &a) @@ -150,11 +153,11 @@ ccl_device_inline float4 operator/(const float4 &a, float f) ccl_device_inline float4 operator/(const float4 &a, const float4 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float4(_mm_div_ps(a.m128, b.m128)); -#else +# else return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); -#endif +# endif } ccl_device_inline float4 operator+(const float4 &a, const float f) @@ -164,11 +167,11 @@ ccl_device_inline float4 operator+(const float4 &a, const float f) ccl_device_inline float4 operator+(const float4 &a, const float4 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float4(_mm_add_ps(a.m128, b.m128)); -#else +# else return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); -#endif +# endif } ccl_device_inline float4 operator-(const float4 &a, const float f) @@ -178,11 +181,11 @@ ccl_device_inline float4 operator-(const float4 &a, const float f) ccl_device_inline float4 operator-(const float4 &a, const float4 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float4(_mm_sub_ps(a.m128, b.m128)); -#else +# else return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); -#endif +# endif } ccl_device_inline float4 operator+=(float4 &a, const float4 &b) @@ -212,38 +215,38 @@ ccl_device_inline float4 operator/=(float4 &a, float f) ccl_device_inline int4 operator<(const float4 &a, const float4 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128))); -#else +# else return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w); -#endif +# endif } ccl_device_inline int4 operator>=(const float4 &a, const float4 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128))); -#else +# else return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w); -#endif +# endif } ccl_device_inline int4 operator<=(const float4 &a, const float4 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return int4(_mm_castps_si128(_mm_cmple_ps(a.m128, b.m128))); -#else +# else return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w); -#endif +# endif } ccl_device_inline bool operator==(const float4 &a, const float4 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15; -#else +# else return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w); -#endif +# endif } ccl_device_inline float distance(const float4 &a, const float4 &b) @@ -253,16 +256,16 @@ ccl_device_inline float distance(const float4 &a, const float4 &b) ccl_device_inline float dot(const float4 &a, const float4 &b) { -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) -# if defined(__KERNEL_NEON__) +# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) +# if defined(__KERNEL_NEON__) __m128 t = vmulq_f32(a, b); return vaddvq_f32(t); -# else +# else return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF)); -# endif -#else +# endif +# else return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w); -#endif +# endif } ccl_device_inline float len_squared(const float4 &a) @@ -272,21 +275,21 @@ ccl_device_inline float len_squared(const float4 &a) ccl_device_inline float4 rcp(const float4 &a) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ /* Don't use _mm_rcp_ps due to poor precision. */ return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128)); -#else +# else return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w); -#endif +# endif } ccl_device_inline float4 sqrt(const float4 &a) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float4(_mm_sqrt_ps(a.m128)); -#else +# else return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w)); -#endif +# endif } ccl_device_inline float4 sqr(const float4 &a) @@ -296,39 +299,39 @@ ccl_device_inline float4 sqr(const float4 &a) ccl_device_inline float4 cross(const float4 &a, const float4 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) - (shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b)); -#else +# else return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f); -#endif +# endif } ccl_device_inline bool is_zero(const float4 &a) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return a == make_float4(0.0f); -#else +# else return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f); -#endif +# endif } ccl_device_inline float4 reduce_add(const float4 &a) { -#if defined(__KERNEL_SSE__) -# if defined(__KERNEL_NEON__) +# if defined(__KERNEL_SSE__) +# if defined(__KERNEL_NEON__) return float4(vdupq_n_f32(vaddvq_f32(a))); -# elif defined(__KERNEL_SSE3__) +# elif defined(__KERNEL_SSE3__) float4 h(_mm_hadd_ps(a.m128, a.m128)); return float4(_mm_hadd_ps(h.m128, h.m128)); -# else +# else float4 h(shuffle<1, 0, 3, 2>(a) + a); return shuffle<2, 3, 0, 1>(h) + h; -# endif -#else +# endif +# else float sum = (a.x + a.y) + (a.z + a.w); return make_float4(sum, sum, sum, sum); -#endif +# endif } ccl_device_inline float average(const float4 &a) @@ -354,20 +357,20 @@ ccl_device_inline float4 safe_normalize(const float4 &a) ccl_device_inline float4 min(const float4 &a, const float4 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float4(_mm_min_ps(a.m128, b.m128)); -#else +# else return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); -#endif +# endif } ccl_device_inline float4 max(const float4 &a, const float4 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float4(_mm_max_ps(a.m128, b.m128)); -#else +# else return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); -#endif +# endif } ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &mx) @@ -377,24 +380,24 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 & ccl_device_inline float4 fabs(const float4 &a) { -#if defined(__KERNEL_SSE__) -# if defined(__KERNEL_NEON__) +# if defined(__KERNEL_SSE__) +# if defined(__KERNEL_NEON__) return float4(vabsq_f32(a)); -# else +# else return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); -# endif -#else +# endif +# else return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w)); -#endif +# endif } ccl_device_inline float4 floor(const float4 &a) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return float4(_mm_floor_ps(a)); -#else +# else return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w)); -#endif +# endif } ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t) @@ -402,6 +405,8 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t) return a + t * (b - a); } +#endif /* !__KERNEL_METAL__*/ + #ifdef __KERNEL_SSE__ template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> __forceinline const float4 shuffle(const float4 &b) diff --git a/intern/cycles/util/math_int2.h b/intern/cycles/util/math_int2.h index 5b04be92152..39dc3b28f11 100644 --- a/intern/cycles/util/math_int2.h +++ b/intern/cycles/util/math_int2.h @@ -27,17 +27,20 @@ CCL_NAMESPACE_BEGIN * Declaration. */ +#if !defined(__KERNEL_METAL__) ccl_device_inline bool operator==(const int2 a, const int2 b); ccl_device_inline int2 operator+(const int2 &a, const int2 &b); ccl_device_inline int2 operator+=(int2 &a, const int2 &b); ccl_device_inline int2 operator-(const int2 &a, const int2 &b); ccl_device_inline int2 operator*(const int2 &a, const int2 &b); ccl_device_inline int2 operator/(const int2 &a, const int2 &b); +#endif /* !__KERNEL_METAL__ */ /******************************************************************************* * Definition. */ +#if !defined(__KERNEL_METAL__) ccl_device_inline bool operator==(const int2 a, const int2 b) { return (a.x == b.x && a.y == b.y); @@ -67,6 +70,7 @@ ccl_device_inline int2 operator/(const int2 &a, const int2 &b) { return make_int2(a.x / b.x, a.y / b.y); } +#endif /* !__KERNEL_METAL__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/math_int3.h b/intern/cycles/util/math_int3.h index 128f2cb53b8..a09c68ef49a 100644 --- a/intern/cycles/util/math_int3.h +++ b/intern/cycles/util/math_int3.h @@ -27,49 +27,52 @@ CCL_NAMESPACE_BEGIN * Declaration. */ +#if !defined(__KERNEL_METAL__) ccl_device_inline int3 min(int3 a, int3 b); ccl_device_inline int3 max(int3 a, int3 b); ccl_device_inline int3 clamp(const int3 &a, int mn, int mx); ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx); +#endif /* !defined(__KERNEL_METAL__) */ /******************************************************************************* * Definition. */ +#if !defined(__KERNEL_METAL__) ccl_device_inline int3 min(int3 a, int3 b) { -#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) +# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) return int3(_mm_min_epi32(a.m128, b.m128)); -#else +# else return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); -#endif +# endif } ccl_device_inline int3 max(int3 a, int3 b) { -#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) +# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) return int3(_mm_max_epi32(a.m128, b.m128)); -#else +# else return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); -#endif +# endif } ccl_device_inline int3 clamp(const int3 &a, int mn, int mx) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return min(max(a, make_int3(mn)), make_int3(mx)); -#else +# else return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx)); -#endif +# endif } ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return min(max(a, mn), make_int3(mx)); -#else +# else return make_int3(clamp(a.x, mn.x, mx), clamp(a.y, mn.y, mx), clamp(a.z, mn.z, mx)); -#endif +# endif } ccl_device_inline bool operator==(const int3 &a, const int3 &b) @@ -89,21 +92,22 @@ ccl_device_inline bool operator<(const int3 &a, const int3 &b) ccl_device_inline int3 operator+(const int3 &a, const int3 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return int3(_mm_add_epi32(a.m128, b.m128)); -#else +# else return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); -#endif +# endif } ccl_device_inline int3 operator-(const int3 &a, const int3 &b) { -#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE__ return int3(_mm_sub_epi32(a.m128, b.m128)); -#else +# else return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); -#endif +# endif } +#endif /* !__KERNEL_METAL__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/math_intersect.h b/intern/cycles/util/math_intersect.h index 0c431a36afb..54ce3ab4b66 100644 --- a/intern/cycles/util/math_intersect.h +++ b/intern/cycles/util/math_intersect.h @@ -88,29 +88,16 @@ ccl_device bool ray_aligned_disk_intersect(float3 ray_P, ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P, float3 ray_dir, float ray_t, -#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) - const ssef *ssef_verts, -#else const float3 tri_a, const float3 tri_b, const float3 tri_c, -#endif ccl_private float *isect_u, ccl_private float *isect_v, ccl_private float *isect_t) { -#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) - typedef ssef float3; - const float3 tri_a(ssef_verts[0]); - const float3 tri_b(ssef_verts[1]); - const float3 tri_c(ssef_verts[2]); - const float3 P(ray_P); - const float3 dir(ray_dir); -#else -# define dot3(a, b) dot(a, b) +#define dot3(a, b) dot(a, b) const float3 P = ray_P; const float3 dir = ray_dir; -#endif /* Calculate vertices relative to ray origin. */ const float3 v0 = tri_c - P; @@ -123,43 +110,16 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P, const float3 e2 = v1 - v2; /* Perform edge tests. */ -#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) - const float3 crossU = cross(v2 + v0, e0); - const float3 crossV = cross(v0 + v1, e1); - const float3 crossW = cross(v1 + v2, e2); - - ssef crossX(crossU); - ssef crossY(crossV); - ssef crossZ(crossW); - ssef zero = _mm_setzero_ps(); - _MM_TRANSPOSE4_PS(crossX, crossY, crossZ, zero); - - const ssef dirX(ray_dir.x); - const ssef dirY(ray_dir.y); - const ssef dirZ(ray_dir.z); - - ssef UVWW = madd(crossX, dirX, madd(crossY, dirY, crossZ * dirZ)); -#else /* __KERNEL_SSE2__ */ const float U = dot(cross(v2 + v0, e0), ray_dir); const float V = dot(cross(v0 + v1, e1), ray_dir); const float W = dot(cross(v1 + v2, e2), ray_dir); -#endif /* __KERNEL_SSE2__ */ -#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) - int uvw_sign = movemask(UVWW) & 0x7; - if (uvw_sign != 0) { - if (uvw_sign != 0x7) { - return false; - } - } -#else const float minUVW = min(U, min(V, W)); const float maxUVW = max(U, max(V, W)); if (minUVW < 0.0f && maxUVW > 0.0f) { return false; } -#endif /* Calculate geometry normal and denominator. */ const float3 Ng1 = cross(e1, e0); @@ -180,14 +140,8 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P, } const float inv_den = 1.0f / den; -#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) - UVWW *= inv_den; - _mm_store_ss(isect_u, UVWW); - _mm_store_ss(isect_v, shuffle<1, 1, 3, 3>(UVWW)); -#else *isect_u = U * inv_den; *isect_v = V * inv_den; -#endif *isect_t = T * inv_den; return true; diff --git a/intern/cycles/util/math_matrix.h b/intern/cycles/util/math_matrix.h index bff7ddb4cee..c1be71517e3 100644 --- a/intern/cycles/util/math_matrix.h +++ b/intern/cycles/util/math_matrix.h @@ -162,7 +162,7 @@ ccl_device_inline void math_trimatrix_add_gramian(ccl_global float *A, { for (int row = 0; row < n; row++) { for (int col = 0; col <= row; col++) { - MATHS(A, row, col, 1) += v[row] * v[col] * weight; + atomic_add_and_fetch_float(&MATHS(A, row, col, 1), v[row] * v[col] * weight); } } } diff --git a/intern/cycles/util/path.cpp b/intern/cycles/util/path.cpp index 5704c4ef8ef..aad790482d5 100644 --- a/intern/cycles/util/path.cpp +++ b/intern/cycles/util/path.cpp @@ -313,7 +313,7 @@ static char *path_specials(const string &sub) if (env_shader_path != NULL && sub == "shader") { return env_shader_path; } - else if (env_shader_path != NULL && sub == "source") { + else if (env_source_path != NULL && sub == "source") { return env_source_path; } return NULL; @@ -541,7 +541,7 @@ static string path_make_compatible(const string &path) if ((path.size() >= 3) && (path[0] == DIR_SEP) && (path[1] == DIR_SEP)) { result = path_cleanup_unc(result); } - /* Make sure volume-only path ends up wit ha directory separator. */ + /* Make sure volume-only path ends up wit a directory separator. */ if (result.size() == 2 && result[1] == ':') { result += DIR_SEP; } diff --git a/intern/cycles/util/progress.h b/intern/cycles/util/progress.h index f2d80e49ab8..15bd26d34bf 100644 --- a/intern/cycles/util/progress.h +++ b/intern/cycles/util/progress.h @@ -207,7 +207,7 @@ class Progress { if (total_pixel_samples > 0) { return ((double)pixel_samples) / (double)total_pixel_samples; } - return 0.0f; + return 0.0; } void add_samples(uint64_t pixel_samples_, int tile_sample) diff --git a/intern/cycles/util/ssef.h b/intern/cycles/util/ssef.h index ea5e78b54d2..fc496e55a0c 100644 --- a/intern/cycles/util/ssef.h +++ b/intern/cycles/util/ssef.h @@ -906,7 +906,7 @@ __forceinline void store4f_nt(void *ptr, const ssef &v) } //////////////////////////////////////////////////////////////////////////////// -/// Euclidian Space Operators +/// Euclidean Space Operators //////////////////////////////////////////////////////////////////////////////// __forceinline float dot(const ssef &a, const ssef &b) diff --git a/intern/cycles/util/transform.h b/intern/cycles/util/transform.h index 7bfe747fcfb..1d78dfd1385 100644 --- a/intern/cycles/util/transform.h +++ b/intern/cycles/util/transform.h @@ -53,6 +53,15 @@ typedef struct DecomposedTransform { /* Functions */ +#ifdef __KERNEL_METAL__ +/* transform_point specialized for ccl_global */ +ccl_device_inline float3 transform_point(ccl_global const Transform *t, const float3 a) +{ + ccl_global const float3x3 &b(*(ccl_global const float3x3 *)t); + return (a * b).xyz + make_float3(t->x.w, t->y.w, t->z.w); +} +#endif + ccl_device_inline float3 transform_point(ccl_private const Transform *t, const float3 a) { /* TODO(sergey): Disabled for now, causes crashes in certain cases. */ @@ -73,6 +82,9 @@ ccl_device_inline float3 transform_point(ccl_private const Transform *t, const f tmp += w; return float3(tmp.m128); +#elif defined(__KERNEL_METAL__) + ccl_private const float3x3 &b(*(ccl_private const float3x3 *)t); + return (a * b).xyz + make_float3(t->x.w, t->y.w, t->z.w); #else float3 c = make_float3(a.x * t->x.x + a.y * t->x.y + a.z * t->x.z + t->x.w, a.x * t->y.x + a.y * t->y.y + a.z * t->y.z + t->y.w, @@ -99,6 +111,9 @@ ccl_device_inline float3 transform_direction(ccl_private const Transform *t, con tmp = madd(shuffle<2>(aa), z, tmp); return float3(tmp.m128); +#elif defined(__KERNEL_METAL__) + ccl_private const float3x3 &b(*(ccl_private const float3x3 *)t); + return (a * b).xyz; #else float3 c = make_float3(a.x * t->x.x + a.y * t->x.y + a.z * t->x.z, a.x * t->y.x + a.y * t->y.y + a.z * t->y.z, @@ -450,8 +465,8 @@ ccl_device_inline void transform_compose(ccl_private Transform *tfm, } /* Interpolate from array of decomposed transforms. */ -ccl_device void transform_motion_array_interpolate(Transform *tfm, - const DecomposedTransform *motion, +ccl_device void transform_motion_array_interpolate(ccl_private Transform *tfm, + ccl_global const DecomposedTransform *motion, uint numsteps, float time) { @@ -460,8 +475,8 @@ ccl_device void transform_motion_array_interpolate(Transform *tfm, int step = min((int)(time * maxstep), maxstep - 1); float t = time * maxstep - step; - const DecomposedTransform *a = motion + step; - const DecomposedTransform *b = motion + step + 1; + ccl_global const DecomposedTransform *a = motion + step; + ccl_global const DecomposedTransform *b = motion + step + 1; /* Interpolate rotation, translation and scale. */ DecomposedTransform decomp; diff --git a/intern/cycles/util/types.h b/intern/cycles/util/types.h index 697dc2b44ea..58a6d134819 100644 --- a/intern/cycles/util/types.h +++ b/intern/cycles/util/types.h @@ -17,7 +17,9 @@ #ifndef __UTIL_TYPES_H__ #define __UTIL_TYPES_H__ -#include <stdlib.h> +#if !defined(__KERNEL_METAL__) +# include <stdlib.h> +#endif /* Standard Integer Types */ diff --git a/intern/cycles/util/types_float3.h b/intern/cycles/util/types_float3.h index f990367e7b8..cafcfebf526 100644 --- a/intern/cycles/util/types_float3.h +++ b/intern/cycles/util/types_float3.h @@ -55,6 +55,41 @@ ccl_device_inline float3 make_float3(float x, float y, float z); ccl_device_inline void print_float3(const char *label, const float3 &a); #endif /* __KERNEL_GPU__ */ +/* Smaller float3 for storage. For math operations this must be converted to float3, so that on the + * CPU SIMD instructions can be used. */ +#if defined(__KERNEL_METAL__) +/* Metal has native packed_float3. */ +#elif defined(__KERNEL_CUDA__) +/* CUDA float3 is already packed. */ +typedef float3 packed_float3; +#else +/* HIP float3 is not packed (https://github.com/ROCm-Developer-Tools/HIP/issues/706). */ +struct packed_float3 { + ccl_device_inline_method packed_float3(){}; + + ccl_device_inline_method packed_float3(const float3 &a) : x(a.x), y(a.y), z(a.z) + { + } + + ccl_device_inline_method operator float3() const + { + return make_float3(x, y, z); + } + + ccl_device_inline_method packed_float3 &operator=(const float3 &a) + { + x = a.x; + y = a.y; + z = a.z; + return *this; + } + + float x, y, z; +}; +#endif + +static_assert(sizeof(packed_float3) == 12, "packed_float3 expected to be exactly 12 bytes"); + CCL_NAMESPACE_END #endif /* __UTIL_TYPES_FLOAT3_H__ */ diff --git a/intern/ghost/GHOST_C-api.h b/intern/ghost/GHOST_C-api.h index 784febe8581..98094cc0669 100644 --- a/intern/ghost/GHOST_C-api.h +++ b/intern/ghost/GHOST_C-api.h @@ -729,13 +729,6 @@ extern GHOST_TSuccess GHOST_ReleaseOpenGLContext(GHOST_ContextHandle contexthand extern unsigned int GHOST_GetContextDefaultOpenGLFramebuffer(GHOST_ContextHandle contexthandle); /** - * Returns whether a context is rendered upside down compared to OpenGL. This only needs to be - * called if there's a non-OpenGL context, which is really the exception. - * So generally, this does not need to be called. - */ -extern int GHOST_isUpsideDownContext(GHOST_ContextHandle contexthandle); - -/** * Get the OpenGL frame-buffer handle that serves as a default frame-buffer. */ extern unsigned int GHOST_GetDefaultOpenGLFramebuffer(GHOST_WindowHandle windwHandle); diff --git a/intern/ghost/GHOST_Types.h b/intern/ghost/GHOST_Types.h index 2c8014a08cc..ce0185bc7d0 100644 --- a/intern/ghost/GHOST_Types.h +++ b/intern/ghost/GHOST_Types.h @@ -654,8 +654,8 @@ enum { GHOST_kXrContextDebug = (1 << 0), GHOST_kXrContextDebugTime = (1 << 1), # ifdef WIN32 - /* Needed to avoid issues with the SteamVR OpenGL graphics binding (use DirectX fallback - instead). */ + /* Needed to avoid issues with the SteamVR OpenGL graphics binding + * (use DirectX fallback instead). */ GHOST_kXrContextGpuNVIDIA = (1 << 2), # endif }; diff --git a/intern/ghost/intern/GHOST_SystemCocoa.mm b/intern/ghost/intern/GHOST_SystemCocoa.mm index 204bbdaec50..b92c3e73a88 100644 --- a/intern/ghost/intern/GHOST_SystemCocoa.mm +++ b/intern/ghost/intern/GHOST_SystemCocoa.mm @@ -1245,7 +1245,7 @@ GHOST_TSuccess GHOST_SystemCocoa::handleDraggingEvent(GHOST_TEventType eventType /* Convert the image in a RGBA 32bit format */ /* As Core Graphics does not support contexts with non premutliplied alpha, - we need to get alpha key values in a separate batch */ + * we need to get alpha key values in a separate batch */ /* First get RGB values w/o Alpha to avoid pre-multiplication, * 32bit but last byte is unused */ @@ -1479,8 +1479,8 @@ GHOST_TSuccess GHOST_SystemCocoa::handleMouseEvent(void *eventPtr) CocoaWindow *cocoawindow; /* [event window] returns other windows if mouse-over, that's OSX input standard - however, if mouse exits window(s), the windows become inactive, until you click. - We then fall back to the active window from ghost */ + * however, if mouse exits window(s), the windows become inactive, until you click. + * We then fall back to the active window from ghost. */ window = (GHOST_WindowCocoa *)m_windowManager->getWindowAssociatedWithOSWindow( (void *)[event window]); if (!window) { diff --git a/intern/ghost/intern/GHOST_XrAction.cpp b/intern/ghost/intern/GHOST_XrAction.cpp index 704b1ce9fac..f51f98c9b3d 100644 --- a/intern/ghost/intern/GHOST_XrAction.cpp +++ b/intern/ghost/intern/GHOST_XrAction.cpp @@ -216,8 +216,9 @@ GHOST_XrAction::GHOST_XrAction(XrInstance instance, XrActionCreateInfo action_info{XR_TYPE_ACTION_CREATE_INFO}; strcpy(action_info.actionName, info.name); - strcpy(action_info.localizedActionName, info.name); /* Just use same name for localized. This can - be changed in the future if necessary. */ + + /* Just use same name for localized. This can be changed in the future if necessary. */ + strcpy(action_info.localizedActionName, info.name); switch (info.type) { case GHOST_kXrActionTypeBooleanInput: diff --git a/intern/ghost/intern/GHOST_XrControllerModel.cpp b/intern/ghost/intern/GHOST_XrControllerModel.cpp index ae15bf11aa0..27f92ffe7c5 100644 --- a/intern/ghost/intern/GHOST_XrControllerModel.cpp +++ b/intern/ghost/intern/GHOST_XrControllerModel.cpp @@ -97,8 +97,8 @@ static void read_vertices(const tinygltf::Accessor &accessor, validate_accessor(accessor, buffer_view, buffer, stride, packed_size); /* Resize the vertices vector, if necessary, to include room for the attribute data. - If there are multiple attributes for a primitive, the first one will resize, and the - subsequent will not need to. */ + * If there are multiple attributes for a primitive, the first one will resize, and the + * subsequent will not need to. */ primitive.vertices.resize(accessor.count); /* Copy the attribute value over from the glTF buffer into the appropriate vertex field. */ @@ -147,9 +147,9 @@ static void read_indices(const tinygltf::Accessor &accessor, const tinygltf::Buffer &buffer, GHOST_XrPrimitive &primitive) { - if (buffer_view.target != TINYGLTF_TARGET_ELEMENT_ARRAY_BUFFER && - buffer_view.target != 0) { /* Allow 0 (not specified) even though spec doesn't seem to allow - this (BoomBox GLB fails). */ + + /* Allow 0 (not specified) even though spec doesn't seem to allow this (BoomBox GLB fails). */ + if (buffer_view.target != TINYGLTF_TARGET_ELEMENT_ARRAY_BUFFER && buffer_view.target != 0) { throw GHOST_XrException( "glTF: Accessor for indices uses bufferview with invalid 'target' type."); } @@ -164,8 +164,8 @@ static void read_indices(const tinygltf::Accessor &accessor, validate_accessor(accessor, buffer_view, buffer, component_size_bytes, component_size_bytes); - if ((accessor.count % 3) != 0) { /* Since only triangles are supported, enforce that the number - of indices is divisible by 3. */ + /* Since only triangles are supported, enforce that the number of indices is divisible by 3. */ + if ((accessor.count % 3) != 0) { throw GHOST_XrException("glTF: Unexpected number of indices for triangle primitive"); } diff --git a/intern/guardedalloc/MEM_guardedalloc.h b/intern/guardedalloc/MEM_guardedalloc.h index 713b1fac788..874abb88ff5 100644 --- a/intern/guardedalloc/MEM_guardedalloc.h +++ b/intern/guardedalloc/MEM_guardedalloc.h @@ -78,7 +78,8 @@ extern short (*MEM_testN)(void *vmemh); /** * Duplicates a block of memory, and returns a pointer to the - * newly allocated block. */ + * newly allocated block. + * NULL-safe; will return NULL when receiving a NULL pointer. */ extern void *(*MEM_dupallocN)(const void *vmemh) /* ATTR_MALLOC */ ATTR_WARN_UNUSED_RESULT; /** diff --git a/intern/locale/boost_locale_wrapper.cpp b/intern/locale/boost_locale_wrapper.cpp index ede9377b38f..444b51b5e04 100644 --- a/intern/locale/boost_locale_wrapper.cpp +++ b/intern/locale/boost_locale_wrapper.cpp @@ -26,8 +26,8 @@ static std::string messages_path; static std::string default_domain; static std::string locale_str; -/* Note: We cannot use short stuff like boost::locale::gettext, because those return - * std::basic_string objects, which c_ptr()-returned char* is no more valid +/* NOTE: We cannot use short stuff like `boost::locale::gettext`, because those return + * `std::basic_string` objects, which c_ptr()-returned char* is no more valid * once deleted (which happens as soons they are out of scope of this func). */ typedef boost::locale::message_format<char> char_message_facet; static std::locale locale_global; @@ -63,7 +63,7 @@ static void bl_locale_global_cache() void bl_locale_init(const char *_messages_path, const char *_default_domain) { - // Avoid using ICU backend, we do not need its power and it's rather heavy! + /* Avoid using ICU backend, we do not need its power and it's rather heavy! */ boost::locale::localization_backend_manager lman = boost::locale::localization_backend_manager::global(); #if defined(_WIN32) @@ -81,7 +81,7 @@ void bl_locale_set(const char *locale) { boost::locale::generator gen; std::locale _locale; - // Specify location of dictionaries. + /* Specify location of dictionaries. */ gen.add_messages_path(messages_path); gen.add_messages_domain(default_domain); // gen.set_default_messages_domain(default_domain); @@ -99,12 +99,12 @@ void bl_locale_set(const char *locale) #endif } std::locale::global(_locale); - // Note: boost always uses "C" LC_NUMERIC by default! + /* NOTE: boost always uses "C" LC_NUMERIC by default! */ bl_locale_global_cache(); - // Generate the locale string - // (useful to know which locale we are actually using in case of "default" one). + /* Generate the locale string + * (useful to know which locale we are actually using in case of "default" one). */ #define LOCALE_INFO std::use_facet<boost::locale::info>(_locale) locale_str = LOCALE_INFO.language(); @@ -117,10 +117,9 @@ void bl_locale_set(const char *locale) #undef LOCALE_INFO } - // Extra catch on `std::runtime_error` is needed for macOS/Clang as it seems that exceptions - // like `boost::locale::conv::conversion_error` (which inherit from `std::runtime_error`) are - // not caught by their ancestor `std::exception`. See - // https://developer.blender.org/T88877#1177108 . + /* Extra catch on `std::runtime_error` is needed for macOS/Clang as it seems that exceptions + * like `boost::locale::conv::conversion_error` (which inherit from `std::runtime_error`) are + * not caught by their ancestor `std::exception`. See T88877#1177108 */ catch (std::runtime_error const &e) { std::cout << "bl_locale_set(" << locale << "): " << e.what() << " \n"; } diff --git a/intern/sky/include/sky_model.h b/intern/sky/include/sky_model.h index 983b90fed35..752b5c13785 100644 --- a/intern/sky/include/sky_model.h +++ b/intern/sky/include/sky_model.h @@ -133,7 +133,7 @@ function which generates skydome states for different solar emission spectra and solar radii: 'arhosekskymodelstate_alienworld_alloc_init()'. See the notes about the "Alien World" functionality provided further down for a -discussion of the usefulness and limits of that second initalisation function. +discussion of the usefulness and limits of that second initialisation function. Sky model states that have been initialized with either function behave in a completely identical fashion during use and cleanup. @@ -368,7 +368,7 @@ SKY_ArHosekSkyModelState *SKY_arhosekskymodelstate_alloc_init(const double solar with a sun of a surface temperature given in 'kelvin'. The parameter 'solar_intensity' controls the overall brightness of the sky, relative to the solar irradiance on Earth. A value of 1.0 yields a sky dome that - is, on average over the wavelenghts covered in the model (!), as bright + is, on average over the wavelengths covered in the model (!), as bright as the terrestrial sky in radiometric terms. Which means that the solar radius has to be adjusted, since the |