diff options
Diffstat (limited to 'intern/cycles/kernel/device')
20 files changed, 2109 insertions, 1427 deletions
diff --git a/intern/cycles/kernel/device/cpu/bvh.h b/intern/cycles/kernel/device/cpu/bvh.h new file mode 100644 index 00000000000..2d7d8c2d704 --- /dev/null +++ b/intern/cycles/kernel/device/cpu/bvh.h @@ -0,0 +1,582 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Blender Foundation */ + +/* CPU Embree implementation of ray-scene intersection. */ + +#pragma once + +#include <embree3/rtcore_ray.h> +#include <embree3/rtcore_scene.h> + +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/globals.h" + +#include "kernel/bvh/types.h" +#include "kernel/bvh/util.h" +#include "kernel/geom/object.h" +#include "kernel/integrator/state.h" +#include "kernel/sample/lcg.h" + +#include "util/vector.h" + +CCL_NAMESPACE_BEGIN + +#define EMBREE_IS_HAIR(x) (x & 1) + +/* Intersection context. */ + +struct CCLIntersectContext { + typedef enum { + RAY_REGULAR = 0, + RAY_SHADOW_ALL = 1, + RAY_LOCAL = 2, + RAY_SSS = 3, + RAY_VOLUME_ALL = 4, + } RayType; + + KernelGlobals kg; + RayType type; + + /* For avoiding self intersections */ + const Ray *ray; + + /* for shadow rays */ + Intersection *isect_s; + uint max_hits; + uint num_hits; + uint num_recorded_hits; + float throughput; + float max_t; + bool opaque_hit; + + /* for SSS Rays: */ + LocalIntersection *local_isect; + int local_object_id; + uint *lcg_state; + + CCLIntersectContext(KernelGlobals kg_, RayType type_) + { + kg = kg_; + type = type_; + ray = NULL; + max_hits = 1; + num_hits = 0; + num_recorded_hits = 0; + throughput = 1.0f; + max_t = FLT_MAX; + opaque_hit = false; + isect_s = NULL; + local_isect = NULL; + local_object_id = -1; + lcg_state = NULL; + } +}; + +class IntersectContext { + public: + IntersectContext(CCLIntersectContext *ctx) + { + rtcInitIntersectContext(&context); + userRayExt = ctx; + } + RTCIntersectContext context; + CCLIntersectContext *userRayExt; +}; + +/* Utilities. */ + +ccl_device_inline void kernel_embree_setup_ray(const Ray &ray, + RTCRay &rtc_ray, + const uint visibility) +{ + rtc_ray.org_x = ray.P.x; + rtc_ray.org_y = ray.P.y; + rtc_ray.org_z = ray.P.z; + rtc_ray.dir_x = ray.D.x; + rtc_ray.dir_y = ray.D.y; + rtc_ray.dir_z = ray.D.z; + rtc_ray.tnear = ray.tmin; + rtc_ray.tfar = ray.tmax; + rtc_ray.time = ray.time; + rtc_ray.mask = visibility; +} + +ccl_device_inline void kernel_embree_setup_rayhit(const Ray &ray, + RTCRayHit &rayhit, + const uint visibility) +{ + kernel_embree_setup_ray(ray, rayhit.ray, visibility); + rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID; + rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID; +} + +ccl_device_inline bool kernel_embree_is_self_intersection(const KernelGlobals kg, + const RTCHit *hit, + const Ray *ray) +{ + int object, prim; + + if (hit->instID[0] != RTC_INVALID_GEOMETRY_ID) { + object = hit->instID[0] / 2; + if ((ray->self.object == object) || (ray->self.light_object == object)) { + RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData( + rtcGetGeometry(kernel_data.device_bvh, hit->instID[0])); + prim = hit->primID + + (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID)); + } + else { + return false; + } + } + else { + object = hit->geomID / 2; + if ((ray->self.object == object) || (ray->self.light_object == object)) { + prim = hit->primID + + (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(kernel_data.device_bvh, hit->geomID)); + } + else { + return false; + } + } + + const bool is_hair = hit->geomID & 1; + if (is_hair) { + prim = kernel_data_fetch(curve_segments, prim).prim; + } + + return intersection_skip_self_shadow(ray->self, object, prim); +} + +ccl_device_inline void kernel_embree_convert_hit(KernelGlobals kg, + const RTCRay *ray, + const RTCHit *hit, + Intersection *isect) +{ + isect->t = ray->tfar; + if (hit->instID[0] != RTC_INVALID_GEOMETRY_ID) { + RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData( + rtcGetGeometry(kernel_data.device_bvh, hit->instID[0])); + isect->prim = hit->primID + + (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID)); + isect->object = hit->instID[0] / 2; + } + else { + isect->prim = hit->primID + (intptr_t)rtcGetGeometryUserData( + rtcGetGeometry(kernel_data.device_bvh, hit->geomID)); + isect->object = hit->geomID / 2; + } + + const bool is_hair = hit->geomID & 1; + if (is_hair) { + const KernelCurveSegment segment = kernel_data_fetch(curve_segments, isect->prim); + isect->type = segment.type; + isect->prim = segment.prim; + isect->u = hit->u; + isect->v = hit->v; + } + else { + isect->type = kernel_data_fetch(objects, isect->object).primitive_type; + isect->u = hit->u; + isect->v = hit->v; + } +} + +ccl_device_inline void kernel_embree_convert_sss_hit( + KernelGlobals kg, const RTCRay *ray, const RTCHit *hit, Intersection *isect, int object) +{ + isect->u = hit->u; + isect->v = hit->v; + isect->t = ray->tfar; + RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData( + rtcGetGeometry(kernel_data.device_bvh, object * 2)); + isect->prim = hit->primID + + (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID)); + isect->object = object; + isect->type = kernel_data_fetch(objects, object).primitive_type; +} + +/* Ray filter functions. */ + +/* This gets called by Embree at every valid ray/object intersection. + * Things like recording subsurface or shadow hits for later evaluation + * as well as filtering for volume objects happen here. + * Cycles' own BVH does that directly inside the traversal calls. */ +ccl_device void kernel_embree_filter_intersection_func(const RTCFilterFunctionNArguments *args) +{ + /* Current implementation in Cycles assumes only single-ray intersection queries. */ + assert(args->N == 1); + + RTCHit *hit = (RTCHit *)args->hit; + CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt; + const KernelGlobalsCPU *kg = ctx->kg; + const Ray *cray = ctx->ray; + + if (kernel_embree_is_self_intersection(kg, hit, cray)) { + *args->valid = 0; + } +} + +/* This gets called by Embree at every valid ray/object intersection. + * Things like recording subsurface or shadow hits for later evaluation + * as well as filtering for volume objects happen here. + * Cycles' own BVH does that directly inside the traversal calls. + */ +ccl_device void kernel_embree_filter_occluded_func(const RTCFilterFunctionNArguments *args) +{ + /* Current implementation in Cycles assumes only single-ray intersection queries. */ + assert(args->N == 1); + + const RTCRay *ray = (RTCRay *)args->ray; + RTCHit *hit = (RTCHit *)args->hit; + CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt; + const KernelGlobalsCPU *kg = ctx->kg; + const Ray *cray = ctx->ray; + + switch (ctx->type) { + case CCLIntersectContext::RAY_SHADOW_ALL: { + Intersection current_isect; + kernel_embree_convert_hit(kg, ray, hit, ¤t_isect); + if (intersection_skip_self_shadow(cray->self, current_isect.object, current_isect.prim)) { + *args->valid = 0; + return; + } + /* If no transparent shadows or max number of hits exceeded, all light is blocked. */ + const int flags = intersection_get_shader_flags(kg, current_isect.prim, current_isect.type); + if (!(flags & (SD_HAS_TRANSPARENT_SHADOW)) || ctx->num_hits >= ctx->max_hits) { + ctx->opaque_hit = true; + return; + } + + ++ctx->num_hits; + + /* Always use baked shadow transparency for curves. */ + if (current_isect.type & PRIMITIVE_CURVE) { + ctx->throughput *= intersection_curve_shadow_transparency( + kg, current_isect.object, current_isect.prim, current_isect.type, current_isect.u); + + if (ctx->throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) { + ctx->opaque_hit = true; + return; + } + else { + *args->valid = 0; + return; + } + } + + /* Test if we need to record this transparent intersection. */ + const uint max_record_hits = min(ctx->max_hits, INTEGRATOR_SHADOW_ISECT_SIZE); + if (ctx->num_recorded_hits < max_record_hits || ray->tfar < ctx->max_t) { + /* If maximum number of hits was reached, replace the intersection with the + * highest distance. We want to find the N closest intersections. */ + const uint num_recorded_hits = min(ctx->num_recorded_hits, max_record_hits); + uint isect_index = num_recorded_hits; + if (num_recorded_hits + 1 >= max_record_hits) { + float max_t = ctx->isect_s[0].t; + uint max_recorded_hit = 0; + + for (uint i = 1; i < num_recorded_hits; ++i) { + if (ctx->isect_s[i].t > max_t) { + max_recorded_hit = i; + max_t = ctx->isect_s[i].t; + } + } + + if (num_recorded_hits >= max_record_hits) { + isect_index = max_recorded_hit; + } + + /* Limit the ray distance and stop counting hits beyond this. + * TODO: is there some way we can tell Embree to stop intersecting beyond + * this distance when max number of hits is reached?. Or maybe it will + * become irrelevant if we make max_hits a very high number on the CPU. */ + ctx->max_t = max(current_isect.t, max_t); + } + + ctx->isect_s[isect_index] = current_isect; + } + + /* Always increase the number of recorded hits, even beyond the maximum, + * so that we can detect this and trace another ray if needed. */ + ++ctx->num_recorded_hits; + + /* This tells Embree to continue tracing. */ + *args->valid = 0; + break; + } + case CCLIntersectContext::RAY_LOCAL: + case CCLIntersectContext::RAY_SSS: { + /* Check if it's hitting the correct object. */ + Intersection current_isect; + if (ctx->type == CCLIntersectContext::RAY_SSS) { + kernel_embree_convert_sss_hit(kg, ray, hit, ¤t_isect, ctx->local_object_id); + } + else { + kernel_embree_convert_hit(kg, ray, hit, ¤t_isect); + if (ctx->local_object_id != current_isect.object) { + /* This tells Embree to continue tracing. */ + *args->valid = 0; + break; + } + } + if (intersection_skip_self_local(cray->self, current_isect.prim)) { + *args->valid = 0; + return; + } + + /* No intersection information requested, just return a hit. */ + if (ctx->max_hits == 0) { + break; + } + + /* Ignore curves. */ + if (EMBREE_IS_HAIR(hit->geomID)) { + /* This tells Embree to continue tracing. */ + *args->valid = 0; + break; + } + + LocalIntersection *local_isect = ctx->local_isect; + int hit_idx = 0; + + if (ctx->lcg_state) { + /* See triangle_intersect_subsurface() for the native equivalent. */ + for (int i = min((int)ctx->max_hits, local_isect->num_hits) - 1; i >= 0; --i) { + if (local_isect->hits[i].t == ray->tfar) { + /* This tells Embree to continue tracing. */ + *args->valid = 0; + return; + } + } + + local_isect->num_hits++; + + if (local_isect->num_hits <= ctx->max_hits) { + hit_idx = local_isect->num_hits - 1; + } + else { + /* reservoir sampling: if we are at the maximum number of + * hits, randomly replace element or skip it */ + hit_idx = lcg_step_uint(ctx->lcg_state) % local_isect->num_hits; + + if (hit_idx >= ctx->max_hits) { + /* This tells Embree to continue tracing. */ + *args->valid = 0; + return; + } + } + } + else { + /* Record closest intersection only. */ + if (local_isect->num_hits && current_isect.t > local_isect->hits[0].t) { + *args->valid = 0; + return; + } + + local_isect->num_hits = 1; + } + + /* record intersection */ + local_isect->hits[hit_idx] = current_isect; + local_isect->Ng[hit_idx] = normalize(make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z)); + /* This tells Embree to continue tracing. */ + *args->valid = 0; + break; + } + case CCLIntersectContext::RAY_VOLUME_ALL: { + /* Append the intersection to the end of the array. */ + if (ctx->num_hits < ctx->max_hits) { + Intersection current_isect; + kernel_embree_convert_hit(kg, ray, hit, ¤t_isect); + if (intersection_skip_self(cray->self, current_isect.object, current_isect.prim)) { + *args->valid = 0; + return; + } + + Intersection *isect = &ctx->isect_s[ctx->num_hits]; + ++ctx->num_hits; + *isect = current_isect; + /* Only primitives from volume object. */ + uint tri_object = isect->object; + int object_flag = kernel_data_fetch(object_flag, tri_object); + if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) { + --ctx->num_hits; + } + /* This tells Embree to continue tracing. */ + *args->valid = 0; + } + break; + } + case CCLIntersectContext::RAY_REGULAR: + default: + if (kernel_embree_is_self_intersection(kg, hit, cray)) { + *args->valid = 0; + return; + } + break; + } +} + +ccl_device void kernel_embree_filter_func_backface_cull(const RTCFilterFunctionNArguments *args) +{ + const RTCRay *ray = (RTCRay *)args->ray; + RTCHit *hit = (RTCHit *)args->hit; + + /* Always ignore back-facing intersections. */ + if (dot(make_float3(ray->dir_x, ray->dir_y, ray->dir_z), + make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z)) > 0.0f) { + *args->valid = 0; + return; + } + + CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt; + const KernelGlobalsCPU *kg = ctx->kg; + const Ray *cray = ctx->ray; + + if (kernel_embree_is_self_intersection(kg, hit, cray)) { + *args->valid = 0; + } +} + +ccl_device void kernel_embree_filter_occluded_func_backface_cull( + const RTCFilterFunctionNArguments *args) +{ + const RTCRay *ray = (RTCRay *)args->ray; + RTCHit *hit = (RTCHit *)args->hit; + + /* Always ignore back-facing intersections. */ + if (dot(make_float3(ray->dir_x, ray->dir_y, ray->dir_z), + make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z)) > 0.0f) { + *args->valid = 0; + return; + } + + kernel_embree_filter_occluded_func(args); +} + +/* Scene intersection. */ + +ccl_device_intersect bool kernel_embree_intersect(KernelGlobals kg, + ccl_private const Ray *ray, + const uint visibility, + ccl_private Intersection *isect) +{ + isect->t = ray->tmax; + CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_REGULAR); + IntersectContext rtc_ctx(&ctx); + RTCRayHit ray_hit; + ctx.ray = ray; + kernel_embree_setup_rayhit(*ray, ray_hit, visibility); + rtcIntersect1(kernel_data.device_bvh, &rtc_ctx.context, &ray_hit); + if (ray_hit.hit.geomID == RTC_INVALID_GEOMETRY_ID || + ray_hit.hit.primID == RTC_INVALID_GEOMETRY_ID) { + return false; + } + + kernel_embree_convert_hit(kg, &ray_hit.ray, &ray_hit.hit, isect); + return true; +} + +#ifdef __BVH_LOCAL__ +ccl_device_intersect bool kernel_embree_intersect_local(KernelGlobals kg, + ccl_private const Ray *ray, + ccl_private LocalIntersection *local_isect, + int local_object, + ccl_private uint *lcg_state, + int max_hits) +{ + const bool has_bvh = !(kernel_data_fetch(object_flag, local_object) & + SD_OBJECT_TRANSFORM_APPLIED); + CCLIntersectContext ctx(kg, + has_bvh ? CCLIntersectContext::RAY_SSS : CCLIntersectContext::RAY_LOCAL); + ctx.lcg_state = lcg_state; + ctx.max_hits = max_hits; + ctx.ray = ray; + ctx.local_isect = local_isect; + if (local_isect) { + local_isect->num_hits = 0; + } + ctx.local_object_id = local_object; + IntersectContext rtc_ctx(&ctx); + RTCRay rtc_ray; + kernel_embree_setup_ray(*ray, rtc_ray, PATH_RAY_ALL_VISIBILITY); + + /* If this object has its own BVH, use it. */ + if (has_bvh) { + RTCGeometry geom = rtcGetGeometry(kernel_data.device_bvh, local_object * 2); + if (geom) { + float3 P = ray->P; + float3 dir = ray->D; + float3 idir = ray->D; + bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir); + + rtc_ray.org_x = P.x; + rtc_ray.org_y = P.y; + rtc_ray.org_z = P.z; + rtc_ray.dir_x = dir.x; + rtc_ray.dir_y = dir.y; + rtc_ray.dir_z = dir.z; + rtc_ray.tnear = ray->tmin; + rtc_ray.tfar = ray->tmax; + RTCScene scene = (RTCScene)rtcGetGeometryUserData(geom); + kernel_assert(scene); + if (scene) { + rtcOccluded1(scene, &rtc_ctx.context, &rtc_ray); + } + } + } + else { + rtcOccluded1(kernel_data.device_bvh, &rtc_ctx.context, &rtc_ray); + } + + /* rtcOccluded1 sets tfar to -inf if a hit was found. */ + return (local_isect && local_isect->num_hits > 0) || (rtc_ray.tfar < 0); +} +#endif + +#ifdef __SHADOW_RECORD_ALL__ +ccl_device_intersect bool kernel_embree_intersect_shadow_all(KernelGlobals kg, + IntegratorShadowStateCPU *state, + ccl_private const Ray *ray, + uint visibility, + uint max_hits, + ccl_private uint *num_recorded_hits, + ccl_private float *throughput) +{ + CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL); + Intersection *isect_array = (Intersection *)state->shadow_isect; + ctx.isect_s = isect_array; + ctx.max_hits = max_hits; + ctx.ray = ray; + IntersectContext rtc_ctx(&ctx); + RTCRay rtc_ray; + kernel_embree_setup_ray(*ray, rtc_ray, visibility); + rtcOccluded1(kernel_data.device_bvh, &rtc_ctx.context, &rtc_ray); + + *num_recorded_hits = ctx.num_recorded_hits; + *throughput = ctx.throughput; + return ctx.opaque_hit; +} +#endif + +#ifdef __VOLUME__ +ccl_device_intersect uint kernel_embree_intersect_volume(KernelGlobals kg, + ccl_private const Ray *ray, + ccl_private Intersection *isect, + const uint max_hits, + const uint visibility) +{ + CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_VOLUME_ALL); + ctx.isect_s = isect; + ctx.max_hits = max_hits; + ctx.num_hits = 0; + ctx.ray = ray; + IntersectContext rtc_ctx(&ctx); + RTCRay rtc_ray; + kernel_embree_setup_ray(*ray, rtc_ray, visibility); + rtcOccluded1(kernel_data.device_bvh, &rtc_ctx.context, &rtc_ray); + return ctx.num_hits; +} +#endif + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/cpu/compat.h b/intern/cycles/kernel/device/cpu/compat.h index 3bfc37e98ee..1e3e790ca1f 100644 --- a/intern/cycles/kernel/device/cpu/compat.h +++ b/intern/cycles/kernel/device/cpu/compat.h @@ -3,8 +3,6 @@ #pragma once -#define __KERNEL_CPU__ - /* Release kernel has too much false-positive maybe-uninitialized warnings, * which makes it possible to miss actual warnings. */ @@ -35,38 +33,4 @@ CCL_NAMESPACE_BEGIN #define kernel_assert(cond) assert(cond) -/* Macros to handle different memory storage on different devices */ - -#ifdef __KERNEL_SSE2__ -typedef vector3<sseb> sse3b; -typedef vector3<ssef> sse3f; -typedef vector3<ssei> sse3i; - -ccl_device_inline void print_sse3b(const char *label, sse3b &a) -{ - print_sseb(label, a.x); - print_sseb(label, a.y); - print_sseb(label, a.z); -} - -ccl_device_inline void print_sse3f(const char *label, sse3f &a) -{ - print_ssef(label, a.x); - print_ssef(label, a.y); - print_ssef(label, a.z); -} - -ccl_device_inline void print_sse3i(const char *label, sse3i &a) -{ - print_ssei(label, a.x); - print_ssei(label, a.y); - print_ssei(label, a.z); -} - -# if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__) -typedef vector3<avxf> avx3f; -# endif - -#endif - CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/cpu/globals.h b/intern/cycles/kernel/device/cpu/globals.h index 309afae412e..f7f1a36b2a7 100644 --- a/intern/cycles/kernel/device/cpu/globals.h +++ b/intern/cycles/kernel/device/cpu/globals.h @@ -9,6 +9,8 @@ #include "kernel/types.h" #include "kernel/util/profiling.h" +#include "util/guiding.h" + CCL_NAMESPACE_BEGIN /* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in @@ -43,9 +45,20 @@ typedef struct KernelGlobalsCPU { #ifdef __OSL__ /* On the CPU, we also have the OSL globals here. Most data structures are shared * with SVM, the difference is in the shaders and object/mesh attributes. */ - OSLGlobals *osl; - OSLShadingSystem *osl_ss; - OSLThreadData *osl_tdata; + OSLGlobals *osl = nullptr; + OSLShadingSystem *osl_ss = nullptr; + OSLThreadData *osl_tdata = nullptr; +#endif + +#ifdef __PATH_GUIDING__ + /* Pointers to global data structures. */ + openpgl::cpp::SampleStorage *opgl_sample_data_storage = nullptr; + openpgl::cpp::Field *opgl_guiding_field = nullptr; + + /* Local data structures owned by the thread. */ + openpgl::cpp::PathSegmentStorage *opgl_path_segment_storage = nullptr; + openpgl::cpp::SurfaceSamplingDistribution *opgl_surface_sampling_distribution = nullptr; + openpgl::cpp::VolumeSamplingDistribution *opgl_volume_sampling_distribution = nullptr; #endif /* **** Run-time data **** */ diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h index 0e5f7b4a2fd..0d7c06f4fc6 100644 --- a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h +++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h @@ -34,7 +34,7 @@ # include "kernel/integrator/megakernel.h" # include "kernel/film/adaptive_sampling.h" -# include "kernel/film/id_passes.h" +# include "kernel/film/cryptomatte_passes.h" # include "kernel/film/read.h" # include "kernel/bake/bake.h" @@ -169,7 +169,7 @@ bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)( STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_convergence_check); return false; #else - return kernel_adaptive_sampling_convergence_check( + return film_adaptive_sampling_convergence_check( kg, render_buffer, x, y, threshold, reset, offset, stride); #endif } @@ -185,7 +185,7 @@ void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobalsCP #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_x); #else - kernel_adaptive_sampling_filter_x(kg, render_buffer, y, start_x, width, offset, stride); + film_adaptive_sampling_filter_x(kg, render_buffer, y, start_x, width, offset, stride); #endif } @@ -200,7 +200,7 @@ void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobalsCP #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_y); #else - kernel_adaptive_sampling_filter_y(kg, render_buffer, x, start_y, height, offset, stride); + film_adaptive_sampling_filter_y(kg, render_buffer, x, start_y, height, offset, stride); #endif } @@ -215,7 +215,7 @@ void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU * #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, cryptomatte_postprocess); #else - kernel_cryptomatte_post(kg, render_buffer, pixel_index); + film_cryptomatte_post(kg, render_buffer, pixel_index); #endif } diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index e1ab802aa80..d7d2000775f 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -526,7 +526,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) bool converged = true; if (x < sw && y < sh) { - converged = ccl_gpu_kernel_call(kernel_adaptive_sampling_convergence_check( + converged = ccl_gpu_kernel_call(film_adaptive_sampling_convergence_check( nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride)); } @@ -553,7 +553,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) if (y < sh) { ccl_gpu_kernel_call( - kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride)); + film_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride)); } } ccl_gpu_kernel_postfix @@ -572,7 +572,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) if (x < sw) { ccl_gpu_kernel_call( - kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride)); + film_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride)); } } ccl_gpu_kernel_postfix @@ -589,7 +589,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) const int pixel_index = ccl_gpu_global_id_x(); if (pixel_index < num_pixels) { - ccl_gpu_kernel_call(kernel_cryptomatte_post(nullptr, render_buffer, pixel_index)); + ccl_gpu_kernel_call(film_cryptomatte_post(nullptr, render_buffer, pixel_index)); } } ccl_gpu_kernel_postfix diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h index c1df49c4f49..38cdcb572eb 100644 --- a/intern/cycles/kernel/device/gpu/parallel_active_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -23,22 +23,6 @@ CCL_NAMESPACE_BEGIN * and keep device specific code in compat.h */ #ifdef __KERNEL_ONEAPI__ -# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED -template<typename IsActiveOp> -void cpu_serial_active_index_array_impl(const uint num_states, - ccl_global int *ccl_restrict indices, - ccl_global int *ccl_restrict num_indices, - IsActiveOp is_active_op) -{ - int write_index = 0; - for (int state_index = 0; state_index < num_states; state_index++) { - if (is_active_op(state_index)) - indices[write_index++] = state_index; - } - *num_indices = write_index; - return; -} -# endif /* WITH_ONEAPI_SYCL_HOST_ENABLED */ template<typename IsActiveOp> void gpu_parallel_active_index_array_impl(const uint num_states, @@ -182,18 +166,11 @@ __device__ num_simd_groups, \ simdgroup_offset) #elif defined(__KERNEL_ONEAPI__) -# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED -# define gpu_parallel_active_index_array( \ - blocksize, num_states, indices, num_indices, is_active_op) \ - if (ccl_gpu_global_size_x() == 1) \ - cpu_serial_active_index_array_impl(num_states, indices, num_indices, is_active_op); \ - else \ - gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op); -# else -# define gpu_parallel_active_index_array( \ - blocksize, num_states, indices, num_indices, is_active_op) \ - gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op) -# endif + +# define gpu_parallel_active_index_array( \ + blocksize, num_states, indices, num_indices, is_active_op) \ + gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op) + #else # define gpu_parallel_active_index_array( \ diff --git a/intern/cycles/kernel/device/metal/bvh.h b/intern/cycles/kernel/device/metal/bvh.h new file mode 100644 index 00000000000..03faa3f020f --- /dev/null +++ b/intern/cycles/kernel/device/metal/bvh.h @@ -0,0 +1,360 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Blender Foundation */ + +/* MetalRT implementation of ray-scene intersection. */ + +#pragma once + +#include "kernel/bvh/types.h" +#include "kernel/bvh/util.h" + +CCL_NAMESPACE_BEGIN + +/* Payload types. */ + +struct MetalRTIntersectionPayload { + RaySelfPrimitives self; + uint visibility; + float u, v; + int prim; + int type; +#if defined(__METALRT_MOTION__) + float time; +#endif +}; + +struct MetalRTIntersectionLocalPayload { + RaySelfPrimitives self; + uint local_object; + uint lcg_state; + short max_hits; + bool has_lcg_state; + bool result; + LocalIntersection local_isect; +}; + +struct MetalRTIntersectionShadowPayload { + RaySelfPrimitives self; + uint visibility; +#if defined(__METALRT_MOTION__) + float time; +#endif + int state; + float throughput; + short max_hits; + short num_hits; + short num_recorded_hits; + bool result; +}; + +/* Scene intersection. */ + +ccl_device_intersect bool scene_intersect(KernelGlobals kg, + ccl_private const Ray *ray, + const uint visibility, + ccl_private Intersection *isect) +{ + if (!intersection_ray_valid(ray)) { + isect->t = ray->tmax; + isect->type = PRIMITIVE_NONE; + return false; + } + +#if defined(__KERNEL_DEBUG__) + if (is_null_instance_acceleration_structure(metal_ancillaries->accel_struct)) { + isect->t = ray->tmax; + isect->type = PRIMITIVE_NONE; + kernel_assert(!"Invalid metal_ancillaries->accel_struct pointer"); + return false; + } + + if (is_null_intersection_function_table(metal_ancillaries->ift_default)) { + isect->t = ray->tmax; + isect->type = PRIMITIVE_NONE; + kernel_assert(!"Invalid ift_default"); + return false; + } +#endif + + metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax); + metalrt_intersector_type metalrt_intersect; + + if (!kernel_data.bvh.have_curves) { + metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle); + } + + MetalRTIntersectionPayload payload; + payload.self = ray->self; + payload.u = 0.0f; + payload.v = 0.0f; + payload.visibility = visibility; + + typename metalrt_intersector_type::result_type intersection; + + uint ray_mask = visibility & 0xFF; + if (0 == ray_mask && (visibility & ~0xFF) != 0) { + ray_mask = 0xFF; + /* No further intersector setup required: Default MetalRT behavior is any-hit. */ + } + else if (visibility & PATH_RAY_SHADOW_OPAQUE) { + /* No further intersector setup required: Shadow ray early termination is controlled by the + * intersection handler */ + } + +#if defined(__METALRT_MOTION__) + payload.time = ray->time; + intersection = metalrt_intersect.intersect(r, + metal_ancillaries->accel_struct, + ray_mask, + ray->time, + metal_ancillaries->ift_default, + payload); +#else + intersection = metalrt_intersect.intersect( + r, metal_ancillaries->accel_struct, ray_mask, metal_ancillaries->ift_default, payload); +#endif + + if (intersection.type == intersection_type::none) { + isect->t = ray->tmax; + isect->type = PRIMITIVE_NONE; + + return false; + } + + isect->t = intersection.distance; + + isect->prim = payload.prim; + isect->type = payload.type; + isect->object = intersection.user_instance_id; + + isect->t = intersection.distance; + if (intersection.type == intersection_type::triangle) { + isect->u = intersection.triangle_barycentric_coord.x; + isect->v = intersection.triangle_barycentric_coord.y; + } + else { + isect->u = payload.u; + isect->v = payload.v; + } + + return isect->type != PRIMITIVE_NONE; +} + +#ifdef __BVH_LOCAL__ +ccl_device_intersect bool scene_intersect_local(KernelGlobals kg, + ccl_private const Ray *ray, + ccl_private LocalIntersection *local_isect, + int local_object, + ccl_private uint *lcg_state, + int max_hits) +{ + if (!intersection_ray_valid(ray)) { + if (local_isect) { + local_isect->num_hits = 0; + } + return false; + } + +# if defined(__KERNEL_DEBUG__) + if (is_null_instance_acceleration_structure(metal_ancillaries->accel_struct)) { + if (local_isect) { + local_isect->num_hits = 0; + } + kernel_assert(!"Invalid metal_ancillaries->accel_struct pointer"); + return false; + } + + if (is_null_intersection_function_table(metal_ancillaries->ift_local)) { + if (local_isect) { + local_isect->num_hits = 0; + } + kernel_assert(!"Invalid ift_local"); + return false; + } +# endif + + metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax); + metalrt_intersector_type metalrt_intersect; + + metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque); + if (!kernel_data.bvh.have_curves) { + metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle); + } + + MetalRTIntersectionLocalPayload payload; + payload.self = ray->self; + payload.local_object = local_object; + payload.max_hits = max_hits; + payload.local_isect.num_hits = 0; + if (lcg_state) { + payload.has_lcg_state = true; + payload.lcg_state = *lcg_state; + } + payload.result = false; + + typename metalrt_intersector_type::result_type intersection; + +# if defined(__METALRT_MOTION__) + intersection = metalrt_intersect.intersect( + r, metal_ancillaries->accel_struct, 0xFF, ray->time, metal_ancillaries->ift_local, payload); +# else + intersection = metalrt_intersect.intersect( + r, metal_ancillaries->accel_struct, 0xFF, metal_ancillaries->ift_local, payload); +# endif + + if (lcg_state) { + *lcg_state = payload.lcg_state; + } + *local_isect = payload.local_isect; + + return payload.result; +} +#endif + +#ifdef __SHADOW_RECORD_ALL__ +ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg, + IntegratorShadowState state, + ccl_private const Ray *ray, + uint visibility, + uint max_hits, + ccl_private uint *num_recorded_hits, + ccl_private float *throughput) +{ + if (!intersection_ray_valid(ray)) { + return false; + } + +# if defined(__KERNEL_DEBUG__) + if (is_null_instance_acceleration_structure(metal_ancillaries->accel_struct)) { + kernel_assert(!"Invalid metal_ancillaries->accel_struct pointer"); + return false; + } + + if (is_null_intersection_function_table(metal_ancillaries->ift_shadow)) { + kernel_assert(!"Invalid ift_shadow"); + return false; + } +# endif + + metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax); + metalrt_intersector_type metalrt_intersect; + + metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque); + if (!kernel_data.bvh.have_curves) { + metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle); + } + + MetalRTIntersectionShadowPayload payload; + payload.self = ray->self; + payload.visibility = visibility; + payload.max_hits = max_hits; + payload.num_hits = 0; + payload.num_recorded_hits = 0; + payload.throughput = 1.0f; + payload.result = false; + payload.state = state; + + uint ray_mask = visibility & 0xFF; + if (0 == ray_mask && (visibility & ~0xFF) != 0) { + ray_mask = 0xFF; + } + + typename metalrt_intersector_type::result_type intersection; + +# if defined(__METALRT_MOTION__) + payload.time = ray->time; + intersection = metalrt_intersect.intersect(r, + metal_ancillaries->accel_struct, + ray_mask, + ray->time, + metal_ancillaries->ift_shadow, + payload); +# else + intersection = metalrt_intersect.intersect( + r, metal_ancillaries->accel_struct, ray_mask, metal_ancillaries->ift_shadow, payload); +# endif + + *num_recorded_hits = payload.num_recorded_hits; + *throughput = payload.throughput; + + return payload.result; +} +#endif + +#ifdef __VOLUME__ +ccl_device_intersect bool scene_intersect_volume(KernelGlobals kg, + ccl_private const Ray *ray, + ccl_private Intersection *isect, + const uint visibility) +{ + if (!intersection_ray_valid(ray)) { + return false; + } + +# if defined(__KERNEL_DEBUG__) + if (is_null_instance_acceleration_structure(metal_ancillaries->accel_struct)) { + kernel_assert(!"Invalid metal_ancillaries->accel_struct pointer"); + return false; + } + + if (is_null_intersection_function_table(metal_ancillaries->ift_default)) { + kernel_assert(!"Invalid ift_default"); + return false; + } +# endif + + metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax); + metalrt_intersector_type metalrt_intersect; + + metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque); + if (!kernel_data.bvh.have_curves) { + metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle); + } + + MetalRTIntersectionPayload payload; + payload.self = ray->self; + payload.visibility = visibility; + + typename metalrt_intersector_type::result_type intersection; + + uint ray_mask = visibility & 0xFF; + if (0 == ray_mask && (visibility & ~0xFF) != 0) { + ray_mask = 0xFF; + } + +# if defined(__METALRT_MOTION__) + payload.time = ray->time; + intersection = metalrt_intersect.intersect(r, + metal_ancillaries->accel_struct, + ray_mask, + ray->time, + metal_ancillaries->ift_default, + payload); +# else + intersection = metalrt_intersect.intersect( + r, metal_ancillaries->accel_struct, ray_mask, metal_ancillaries->ift_default, payload); +# endif + + if (intersection.type == intersection_type::none) { + return false; + } + + isect->prim = payload.prim; + isect->type = payload.type; + isect->object = intersection.user_instance_id; + + isect->t = intersection.distance; + if (intersection.type == intersection_type::triangle) { + isect->u = intersection.triangle_barycentric_coord.x; + isect->v = intersection.triangle_barycentric_coord.y; + } + else { + isect->u = payload.u; + isect->v = payload.v; + } + + return isect->type != PRIMITIVE_NONE; +} +#endif + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h index 0ed52074a90..f689e93e5a2 100644 --- a/intern/cycles/kernel/device/metal/compat.h +++ b/intern/cycles/kernel/device/metal/compat.h @@ -29,24 +29,13 @@ using namespace metal::raytracing; /* Qualifiers */ +#define ccl_device +#define ccl_device_inline ccl_device __attribute__((always_inline)) +#define ccl_device_forceinline ccl_device __attribute__((always_inline)) #if defined(__KERNEL_METAL_APPLE__) - -/* Inline everything for Apple GPUs. - * This gives ~1.1x speedup and 10% spill reduction for integator_shade_surface - * at the cost of longer compile times (~4.5 minutes on M1 Max). */ - -# define ccl_device __attribute__((always_inline)) -# define ccl_device_inline __attribute__((always_inline)) -# define ccl_device_forceinline __attribute__((always_inline)) -# define ccl_device_noinline __attribute__((always_inline)) - +# define ccl_device_noinline ccl_device #else - -# define ccl_device -# define ccl_device_inline ccl_device -# define ccl_device_forceinline ccl_device # define ccl_device_noinline ccl_device __attribute__((noinline)) - #endif #define ccl_device_noinline_cpu ccl_device @@ -189,35 +178,46 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \ } volume_write_lambda_pass{kg, this, state}; /* make_type definitions with Metal style element initializers */ -#ifdef make_float2 -# undef make_float2 -#endif -#ifdef make_float3 -# undef make_float3 -#endif -#ifdef make_float4 -# undef make_float4 -#endif -#ifdef make_int2 -# undef make_int2 -#endif -#ifdef make_int3 -# undef make_int3 -#endif -#ifdef make_int4 -# undef make_int4 -#endif -#ifdef make_uchar4 -# undef make_uchar4 -#endif - -#define make_float2(x, y) float2(x, y) -#define make_float3(x, y, z) float3(x, y, z) -#define make_float4(x, y, z, w) float4(x, y, z, w) -#define make_int2(x, y) int2(x, y) -#define make_int3(x, y, z) int3(x, y, z) -#define make_int4(x, y, z, w) int4(x, y, z, w) -#define make_uchar4(x, y, z, w) uchar4(x, y, z, w) +ccl_device_forceinline float2 make_float2(const float x, const float y) +{ + return float2(x, y); +} + +ccl_device_forceinline float3 make_float3(const float x, const float y, const float z) +{ + return float3(x, y, z); +} + +ccl_device_forceinline float4 make_float4(const float x, + const float y, + const float z, + const float w) +{ + return float4(x, y, z, w); +} + +ccl_device_forceinline int2 make_int2(const int x, const int y) +{ + return int2(x, y); +} + +ccl_device_forceinline int3 make_int3(const int x, const int y, const int z) +{ + return int3(x, y, z); +} + +ccl_device_forceinline int4 make_int4(const int x, const int y, const int z, const int w) +{ + return int4(x, y, z, w); +} + +ccl_device_forceinline uchar4 make_uchar4(const uchar x, + const uchar y, + const uchar z, + const uchar w) +{ + return uchar4(x, y, z, w); +} /* Math functions */ @@ -260,8 +260,6 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \ #ifdef __METALRT__ -# define __KERNEL_GPU_RAYTRACING__ - # if defined(__METALRT_MOTION__) # define METALRT_TAGS instancing, instance_motion, primitive_motion # else diff --git a/intern/cycles/kernel/device/metal/context_begin.h b/intern/cycles/kernel/device/metal/context_begin.h index 99cb1e3826e..e75ec9cadec 100644 --- a/intern/cycles/kernel/device/metal/context_begin.h +++ b/intern/cycles/kernel/device/metal/context_begin.h @@ -34,21 +34,48 @@ class MetalKernelContext { kernel_assert(0); return 0; } - + +#ifdef __KERNEL_METAL_INTEL__ + template<typename TextureType, typename CoordsType> + inline __attribute__((__always_inline__)) + auto ccl_gpu_tex_object_read_intel_workaround(TextureType texture_array, + const uint tid, const uint sid, + CoordsType coords) const + { + switch(sid) { + default: + case 0: return texture_array[tid].tex.sample(sampler(address::repeat, filter::nearest), coords); + case 1: return texture_array[tid].tex.sample(sampler(address::clamp_to_edge, filter::nearest), coords); + case 2: return texture_array[tid].tex.sample(sampler(address::clamp_to_zero, filter::nearest), coords); + case 3: return texture_array[tid].tex.sample(sampler(address::repeat, filter::linear), coords); + case 4: return texture_array[tid].tex.sample(sampler(address::clamp_to_edge, filter::linear), coords); + case 5: return texture_array[tid].tex.sample(sampler(address::clamp_to_zero, filter::linear), coords); + } + } +#endif + // texture2d template<> inline __attribute__((__always_inline__)) float4 ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object_2D tex, float x, float y) const { const uint tid(tex); const uint sid(tex >> 32); +#ifndef __KERNEL_METAL_INTEL__ return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y)); +#else + return ccl_gpu_tex_object_read_intel_workaround(metal_ancillaries->textures_2d, tid, sid, float2(x, y)); +#endif } template<> inline __attribute__((__always_inline__)) float ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object_2D tex, float x, float y) const { const uint tid(tex); const uint sid(tex >> 32); +#ifndef __KERNEL_METAL_INTEL__ return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y)).x; +#else + return ccl_gpu_tex_object_read_intel_workaround(metal_ancillaries->textures_2d, tid, sid, float2(x, y)).x; +#endif } // texture3d @@ -57,14 +84,22 @@ class MetalKernelContext { float4 ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object_3D tex, float x, float y, float z) const { const uint tid(tex); const uint sid(tex >> 32); +#ifndef __KERNEL_METAL_INTEL__ return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z)); +#else + return ccl_gpu_tex_object_read_intel_workaround(metal_ancillaries->textures_3d, tid, sid, float3(x, y, z)); +#endif } template<> inline __attribute__((__always_inline__)) float ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object_3D tex, float x, float y, float z) const { const uint tid(tex); const uint sid(tex >> 32); +#ifndef __KERNEL_METAL_INTEL__ return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z)).x; +#else + return ccl_gpu_tex_object_read_intel_workaround(metal_ancillaries->textures_3d, tid, sid, float3(x, y, z)).x; +#endif } # include "kernel/device/gpu/image.h" diff --git a/intern/cycles/kernel/device/metal/kernel.metal b/intern/cycles/kernel/device/metal/kernel.metal index 764c26dbe8f..8b69ee025cd 100644 --- a/intern/cycles/kernel/device/metal/kernel.metal +++ b/intern/cycles/kernel/device/metal/kernel.metal @@ -1,41 +1,44 @@ /* SPDX-License-Identifier: Apache-2.0 * Copyright 2021-2022 Blender Foundation */ -/* Metal kernel entry points */ +/* Metal kernel entry points. */ #include "kernel/device/metal/compat.h" #include "kernel/device/metal/globals.h" #include "kernel/device/metal/function_constants.h" #include "kernel/device/gpu/kernel.h" -/* MetalRT intersection handlers */ +/* MetalRT intersection handlers. */ + #ifdef __METALRT__ -/* Return type for a bounding box intersection function. */ -struct BoundingBoxIntersectionResult -{ +/* Intersection return types. */ + +/* For a bounding box intersection function. */ +struct BoundingBoxIntersectionResult { bool accept [[accept_intersection]]; bool continue_search [[continue_search]]; float distance [[distance]]; }; -/* Return type for a triangle intersection function. */ -struct TriangleIntersectionResult -{ +/* For a triangle intersection function. */ +struct TriangleIntersectionResult { bool accept [[accept_intersection]]; - bool continue_search [[continue_search]]; + bool continue_search [[continue_search]]; }; enum { METALRT_HIT_TRIANGLE, METALRT_HIT_BOUNDING_BOX }; -ccl_device_inline bool intersection_skip_self(ray_data const RaySelfPrimitives& self, +/* Utilities. */ + +ccl_device_inline bool intersection_skip_self(ray_data const RaySelfPrimitives &self, const int object, const int prim) { return (self.prim == prim) && (self.object == object); } -ccl_device_inline bool intersection_skip_self_shadow(ray_data const RaySelfPrimitives& self, +ccl_device_inline bool intersection_skip_self_shadow(ray_data const RaySelfPrimitives &self, const int object, const int prim) { @@ -43,12 +46,14 @@ ccl_device_inline bool intersection_skip_self_shadow(ray_data const RaySelfPrimi ((self.light_prim == prim) && (self.light_object == object)); } -ccl_device_inline bool intersection_skip_self_local(ray_data const RaySelfPrimitives& self, +ccl_device_inline bool intersection_skip_self_local(ray_data const RaySelfPrimitives &self, const int prim) { return (self.prim == prim); } +/* Hit functions. */ + template<typename TReturn, uint intersection_type> TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal, ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload, @@ -58,7 +63,7 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal, const float ray_tmax) { TReturn result; - + #ifdef __BVH_LOCAL__ uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object); @@ -101,7 +106,8 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal, } else { if (payload.local_isect.num_hits && ray_tmax > payload.local_isect.hits[0].t) { - /* Record closest intersection only. Do not terminate ray here, since there is no guarantee about distance ordering in any-hit */ + /* Record closest intersection only. Do not terminate ray here, since there is no guarantee + * about distance ordering in any-hit */ result.accept = false; result.continue_search = true; return result; @@ -116,8 +122,8 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal, isect->object = object; isect->type = kernel_data_fetch(objects, object).primitive_type; - isect->u = 1.0f - barycentrics.y - barycentrics.x; - isect->v = barycentrics.x; + isect->u = barycentrics.x; + isect->v = barycentrics.y; /* Record geometric normal */ const uint tri_vindex = kernel_data_fetch(tri_vindex, isect->prim).w; @@ -133,21 +139,20 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal, #endif } -[[intersection(triangle, triangle_data, METALRT_TAGS)]] -TriangleIntersectionResult -__anyhit__cycles_metalrt_local_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]], - ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload [[payload]], - uint instance_id [[user_instance_id]], - uint primitive_id [[primitive_id]], - float2 barycentrics [[barycentric_coord]], - float ray_tmax [[distance]]) +[[intersection(triangle, triangle_data, METALRT_TAGS)]] TriangleIntersectionResult +__anyhit__cycles_metalrt_local_hit_tri( + constant KernelParamsMetal &launch_params_metal [[buffer(1)]], + ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload [[payload]], + uint instance_id [[user_instance_id]], + uint primitive_id [[primitive_id]], + float2 barycentrics [[barycentric_coord]], + float ray_tmax [[distance]]) { return metalrt_local_hit<TriangleIntersectionResult, METALRT_HIT_TRIANGLE>( - launch_params_metal, payload, instance_id, primitive_id, barycentrics, ray_tmax); + launch_params_metal, payload, instance_id, primitive_id, barycentrics, ray_tmax); } -[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] -BoundingBoxIntersectionResult +[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult __anyhit__cycles_metalrt_local_hit_box(const float ray_tmax [[max_distance]]) { /* unused function */ @@ -175,23 +180,14 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal, } # endif - if (intersection_skip_self_shadow(payload.self, object, prim)) { - /* continue search */ - return true; - } - - float u = 0.0f, v = 0.0f; + const float u = barycentrics.x; + const float v = barycentrics.y; int type = 0; if (intersection_type == METALRT_HIT_TRIANGLE) { - u = 1.0f - barycentrics.y - barycentrics.x; - v = barycentrics.x; type = kernel_data_fetch(objects, object).primitive_type; } # ifdef __HAIR__ else { - u = barycentrics.x; - v = barycentrics.y; - const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim); type = segment.type; prim = segment.prim; @@ -204,6 +200,11 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal, } # endif + if (intersection_skip_self_shadow(payload.self, object, prim)) { + /* continue search */ + return true; + } + # ifndef __TRANSPARENT_SHADOWS__ /* No transparent shadows support compiled in, make opaque. */ payload.result = true; @@ -215,7 +216,7 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal, short num_recorded_hits = payload.num_recorded_hits; MetalKernelContext context(launch_params_metal); - + /* If no transparent shadows, all light is blocked and we can stop immediately. */ if (num_hits >= max_hits || !(context.intersection_get_shader_flags(NULL, prim, type) & SD_HAS_TRANSPARENT_SHADOW)) { @@ -223,11 +224,11 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal, /* terminate ray */ return false; } - + /* Always use baked shadow transparency for curves. */ if (type & PRIMITIVE_CURVE) { float throughput = payload.throughput; - throughput *= context.intersection_curve_shadow_transparency(nullptr, object, prim, u); + throughput *= context.intersection_curve_shadow_transparency(nullptr, object, prim, type, u); payload.throughput = throughput; payload.num_hits += 1; @@ -240,10 +241,10 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal, return true; } } - + payload.num_hits += 1; payload.num_recorded_hits += 1; - + uint record_index = num_recorded_hits; const IntegratorShadowState state = payload.state; @@ -278,7 +279,7 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal, INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, prim) = prim; INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object; INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type; - + /* Continue tracing. */ # endif /* __TRANSPARENT_SHADOWS__ */ #endif /* __SHADOW_RECORD_ALL__ */ @@ -286,26 +287,25 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal, return true; } -[[intersection(triangle, triangle_data, METALRT_TAGS)]] -TriangleIntersectionResult -__anyhit__cycles_metalrt_shadow_all_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]], - ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]], - unsigned int object [[user_instance_id]], - unsigned int primitive_id [[primitive_id]], - float2 barycentrics [[barycentric_coord]], - float ray_tmax [[distance]]) +[[intersection(triangle, triangle_data, METALRT_TAGS)]] TriangleIntersectionResult +__anyhit__cycles_metalrt_shadow_all_hit_tri( + constant KernelParamsMetal &launch_params_metal [[buffer(1)]], + ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]], + unsigned int object [[user_instance_id]], + unsigned int primitive_id [[primitive_id]], + float2 barycentrics [[barycentric_coord]], + float ray_tmax [[distance]]) { uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object); TriangleIntersectionResult result; result.continue_search = metalrt_shadow_all_hit<METALRT_HIT_TRIANGLE>( - launch_params_metal, payload, object, prim, barycentrics, ray_tmax); + launch_params_metal, payload, object, prim, barycentrics, ray_tmax); result.accept = !result.continue_search; return result; } -[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] -BoundingBoxIntersectionResult +[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult __anyhit__cycles_metalrt_shadow_all_hit_box(const float ray_tmax [[max_distance]]) { /* unused function */ @@ -317,15 +317,16 @@ __anyhit__cycles_metalrt_shadow_all_hit_box(const float ray_tmax [[max_distance] } template<typename TReturnType, uint intersection_type> -inline TReturnType metalrt_visibility_test(constant KernelParamsMetal &launch_params_metal, - ray_data MetalKernelContext::MetalRTIntersectionPayload &payload, - const uint object, - const uint prim, - const float u) +inline TReturnType metalrt_visibility_test( + constant KernelParamsMetal &launch_params_metal, + ray_data MetalKernelContext::MetalRTIntersectionPayload &payload, + const uint object, + uint prim, + const float u) { TReturnType result; - -# ifdef __HAIR__ + +#ifdef __HAIR__ if (intersection_type == METALRT_HIT_BOUNDING_BOX) { /* Filter out curve endcaps. */ if (u == 0.0f || u == 1.0f) { @@ -334,15 +335,23 @@ inline TReturnType metalrt_visibility_test(constant KernelParamsMetal &launch_pa return result; } } -# endif +#endif uint visibility = payload.visibility; -# ifdef __VISIBILITY_FLAG__ +#ifdef __VISIBILITY_FLAG__ if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) { result.accept = false; result.continue_search = true; return result; } +#endif + + if (intersection_type == METALRT_HIT_TRIANGLE) { + } +# ifdef __HAIR__ + else { + prim = kernel_data_fetch(curve_segments, prim).prim; + } # endif /* Shadow ray early termination. */ @@ -371,16 +380,17 @@ inline TReturnType metalrt_visibility_test(constant KernelParamsMetal &launch_pa return result; } -[[intersection(triangle, triangle_data, METALRT_TAGS)]] -TriangleIntersectionResult -__anyhit__cycles_metalrt_visibility_test_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]], - ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]], - unsigned int object [[user_instance_id]], - unsigned int primitive_id [[primitive_id]]) +[[intersection(triangle, triangle_data, METALRT_TAGS)]] TriangleIntersectionResult +__anyhit__cycles_metalrt_visibility_test_tri( + constant KernelParamsMetal &launch_params_metal [[buffer(1)]], + ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]], + unsigned int object [[user_instance_id]], + unsigned int primitive_id [[primitive_id]]) { uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object); - TriangleIntersectionResult result = metalrt_visibility_test<TriangleIntersectionResult, METALRT_HIT_TRIANGLE>( - launch_params_metal, payload, object, prim, 0.0f); + TriangleIntersectionResult result = + metalrt_visibility_test<TriangleIntersectionResult, METALRT_HIT_TRIANGLE>( + launch_params_metal, payload, object, prim, 0.0f); if (result.accept) { payload.prim = prim; payload.type = kernel_data_fetch(objects, object).primitive_type; @@ -388,8 +398,7 @@ __anyhit__cycles_metalrt_visibility_test_tri(constant KernelParamsMetal &launch_ return result; } -[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] -BoundingBoxIntersectionResult +[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult __anyhit__cycles_metalrt_visibility_test_box(const float ray_tmax [[max_distance]]) { /* Unused function */ @@ -400,19 +409,21 @@ __anyhit__cycles_metalrt_visibility_test_box(const float ray_tmax [[max_distance return result; } +/* Primitive intersection functions. */ + #ifdef __HAIR__ -ccl_device_inline -void metalrt_intersection_curve(constant KernelParamsMetal &launch_params_metal, - ray_data MetalKernelContext::MetalRTIntersectionPayload &payload, - const uint object, - const uint prim, - const uint type, - const float3 ray_origin, - const float3 ray_direction, - float time, - const float ray_tmin, - const float ray_tmax, - thread BoundingBoxIntersectionResult &result) +ccl_device_inline void metalrt_intersection_curve( + constant KernelParamsMetal &launch_params_metal, + ray_data MetalKernelContext::MetalRTIntersectionPayload &payload, + const uint object, + const uint prim, + const uint type, + const float3 ray_P, + const float3 ray_D, + float time, + const float ray_tmin, + const float ray_tmax, + thread BoundingBoxIntersectionResult &result) { # ifdef __VISIBILITY_FLAG__ const uint visibility = payload.visibility; @@ -421,25 +432,16 @@ void metalrt_intersection_curve(constant KernelParamsMetal &launch_params_metal, } # endif - float3 P = ray_origin; - float3 dir = ray_direction; - - /* The direction is not normalized by default, but the curve intersection routine expects that */ - float len; - dir = normalize_len(dir, &len); - Intersection isect; isect.t = ray_tmax; - /* Transform maximum distance into object space. */ - if (isect.t != FLT_MAX) - isect.t *= len; MetalKernelContext context(launch_params_metal); - if (context.curve_intersect(NULL, &isect, P, dir, ray_tmin, isect.t, object, prim, time, type)) { + if (context.curve_intersect( + NULL, &isect, ray_P, ray_D, ray_tmin, isect.t, object, prim, time, type)) { result = metalrt_visibility_test<BoundingBoxIntersectionResult, METALRT_HIT_BOUNDING_BOX>( - launch_params_metal, payload, object, prim, isect.u); + launch_params_metal, payload, object, prim, isect.u); if (result.accept) { - result.distance = isect.t / len; + result.distance = isect.t; payload.u = isect.u; payload.v = isect.v; payload.prim = prim; @@ -448,54 +450,41 @@ void metalrt_intersection_curve(constant KernelParamsMetal &launch_params_metal, } } -ccl_device_inline -void metalrt_intersection_curve_shadow(constant KernelParamsMetal &launch_params_metal, - ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload, - const uint object, - const uint prim, - const uint type, - const float3 ray_origin, - const float3 ray_direction, - float time, - const float ray_tmin, - const float ray_tmax, - thread BoundingBoxIntersectionResult &result) +ccl_device_inline void metalrt_intersection_curve_shadow( + constant KernelParamsMetal &launch_params_metal, + ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload, + const uint object, + const uint prim, + const uint type, + const float3 ray_P, + const float3 ray_D, + float time, + const float ray_tmin, + const float ray_tmax, + thread BoundingBoxIntersectionResult &result) { const uint visibility = payload.visibility; - float3 P = ray_origin; - float3 dir = ray_direction; - - /* The direction is not normalized by default, but the curve intersection routine expects that */ - float len; - dir = normalize_len(dir, &len); - Intersection isect; isect.t = ray_tmax; - /* Transform maximum distance into object space */ - if (isect.t != FLT_MAX) - isect.t *= len; MetalKernelContext context(launch_params_metal); - if (context.curve_intersect(NULL, &isect, P, dir, ray_tmin, isect.t, object, prim, time, type)) { + if (context.curve_intersect( + NULL, &isect, ray_P, ray_D, ray_tmin, isect.t, object, prim, time, type)) { result.continue_search = metalrt_shadow_all_hit<METALRT_HIT_BOUNDING_BOX>( - launch_params_metal, payload, object, prim, float2(isect.u, isect.v), ray_tmax); + launch_params_metal, payload, object, prim, float2(isect.u, isect.v), ray_tmax); result.accept = !result.continue_search; - - if (result.accept) { - result.distance = isect.t / len; - } } } -[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] -BoundingBoxIntersectionResult +[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult __intersection__curve_ribbon(constant KernelParamsMetal &launch_params_metal [[buffer(1)]], - ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]], + ray_data MetalKernelContext::MetalRTIntersectionPayload &payload + [[payload]], const uint object [[user_instance_id]], const uint primitive_id [[primitive_id]], - const float3 ray_origin [[origin]], - const float3 ray_direction [[direction]], + const float3 ray_P [[origin]], + const float3 ray_D [[direction]], const float ray_tmin [[min_distance]], const float ray_tmax [[max_distance]]) { @@ -508,28 +497,36 @@ __intersection__curve_ribbon(constant KernelParamsMetal &launch_params_metal [[b result.distance = ray_tmax; if (segment.type & PRIMITIVE_CURVE_RIBBON) { - metalrt_intersection_curve(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction, + metalrt_intersection_curve(launch_params_metal, + payload, + object, + segment.prim, + segment.type, + ray_P, + ray_D, # if defined(__METALRT_MOTION__) payload.time, # else 0.0f, # endif - ray_tmin, ray_tmax, result); + ray_tmin, + ray_tmax, + result); } return result; } -[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] -BoundingBoxIntersectionResult -__intersection__curve_ribbon_shadow(constant KernelParamsMetal &launch_params_metal [[buffer(1)]], - ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]], - const uint object [[user_instance_id]], - const uint primitive_id [[primitive_id]], - const float3 ray_origin [[origin]], - const float3 ray_direction [[direction]], - const float ray_tmin [[min_distance]], - const float ray_tmax [[max_distance]]) +[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult +__intersection__curve_ribbon_shadow( + constant KernelParamsMetal &launch_params_metal [[buffer(1)]], + ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]], + const uint object [[user_instance_id]], + const uint primitive_id [[primitive_id]], + const float3 ray_P [[origin]], + const float3 ray_D [[direction]], + const float ray_tmin [[min_distance]], + const float ray_tmax [[max_distance]]) { uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object); const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim); @@ -540,57 +537,73 @@ __intersection__curve_ribbon_shadow(constant KernelParamsMetal &launch_params_me result.distance = ray_tmax; if (segment.type & PRIMITIVE_CURVE_RIBBON) { - metalrt_intersection_curve_shadow(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction, + metalrt_intersection_curve_shadow(launch_params_metal, + payload, + object, + segment.prim, + segment.type, + ray_P, + ray_D, # if defined(__METALRT_MOTION__) - payload.time, + payload.time, # else - 0.0f, + 0.0f, # endif - ray_tmin, ray_tmax, result); + ray_tmin, + ray_tmax, + result); } return result; } -[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] -BoundingBoxIntersectionResult +[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult __intersection__curve_all(constant KernelParamsMetal &launch_params_metal [[buffer(1)]], - ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]], + ray_data MetalKernelContext::MetalRTIntersectionPayload &payload + [[payload]], const uint object [[user_instance_id]], const uint primitive_id [[primitive_id]], - const float3 ray_origin [[origin]], - const float3 ray_direction [[direction]], + const float3 ray_P [[origin]], + const float3 ray_D [[direction]], const float ray_tmin [[min_distance]], const float ray_tmax [[max_distance]]) { uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object); const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim); - + BoundingBoxIntersectionResult result; result.accept = false; result.continue_search = true; result.distance = ray_tmax; - metalrt_intersection_curve(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction, + metalrt_intersection_curve(launch_params_metal, + payload, + object, + segment.prim, + segment.type, + ray_P, + ray_D, # if defined(__METALRT_MOTION__) payload.time, # else 0.0f, # endif - ray_tmin, ray_tmax, result); + ray_tmin, + ray_tmax, + result); return result; } -[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] -BoundingBoxIntersectionResult -__intersection__curve_all_shadow(constant KernelParamsMetal &launch_params_metal [[buffer(1)]], - ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]], - const uint object [[user_instance_id]], - const uint primitive_id [[primitive_id]], - const float3 ray_origin [[origin]], - const float3 ray_direction [[direction]], - const float ray_tmin [[min_distance]], - const float ray_tmax [[max_distance]]) +[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult +__intersection__curve_all_shadow( + constant KernelParamsMetal &launch_params_metal [[buffer(1)]], + ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]], + const uint object [[user_instance_id]], + const uint primitive_id [[primitive_id]], + const float3 ray_P [[origin]], + const float3 ray_D [[direction]], + const float ray_tmin [[min_distance]], + const float ray_tmax [[max_distance]]) { uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object); const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim); @@ -600,31 +613,39 @@ __intersection__curve_all_shadow(constant KernelParamsMetal &launch_params_metal result.continue_search = true; result.distance = ray_tmax; - metalrt_intersection_curve_shadow(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction, + metalrt_intersection_curve_shadow(launch_params_metal, + payload, + object, + segment.prim, + segment.type, + ray_P, + ray_D, # if defined(__METALRT_MOTION__) - payload.time, + payload.time, # else - 0.0f, + 0.0f, # endif - ray_tmin, ray_tmax, result); + ray_tmin, + ray_tmax, + result); return result; } #endif /* __HAIR__ */ #ifdef __POINTCLOUD__ -ccl_device_inline -void metalrt_intersection_point(constant KernelParamsMetal &launch_params_metal, - ray_data MetalKernelContext::MetalRTIntersectionPayload &payload, - const uint object, - const uint prim, - const uint type, - const float3 ray_origin, - const float3 ray_direction, - float time, - const float ray_tmin, - const float ray_tmax, - thread BoundingBoxIntersectionResult &result) +ccl_device_inline void metalrt_intersection_point( + constant KernelParamsMetal &launch_params_metal, + ray_data MetalKernelContext::MetalRTIntersectionPayload &payload, + const uint object, + const uint prim, + const uint type, + const float3 ray_P, + const float3 ray_D, + float time, + const float ray_tmin, + const float ray_tmax, + thread BoundingBoxIntersectionResult &result) { # ifdef __VISIBILITY_FLAG__ const uint visibility = payload.visibility; @@ -633,25 +654,16 @@ void metalrt_intersection_point(constant KernelParamsMetal &launch_params_metal, } # endif - float3 P = ray_origin; - float3 dir = ray_direction; - - /* The direction is not normalized by default, but the point intersection routine expects that */ - float len; - dir = normalize_len(dir, &len); - Intersection isect; isect.t = ray_tmax; - /* Transform maximum distance into object space. */ - if (isect.t != FLT_MAX) - isect.t *= len; MetalKernelContext context(launch_params_metal); - if (context.point_intersect(NULL, &isect, P, dir, ray_tmin, isect.t, object, prim, time, type)) { + if (context.point_intersect( + NULL, &isect, ray_P, ray_D, ray_tmin, isect.t, object, prim, time, type)) { result = metalrt_visibility_test<BoundingBoxIntersectionResult, METALRT_HIT_BOUNDING_BOX>( - launch_params_metal, payload, object, prim, isect.u); + launch_params_metal, payload, object, prim, isect.u); if (result.accept) { - result.distance = isect.t / len; + result.distance = isect.t; payload.u = isect.u; payload.v = isect.v; payload.prim = prim; @@ -660,56 +672,46 @@ void metalrt_intersection_point(constant KernelParamsMetal &launch_params_metal, } } -ccl_device_inline -void metalrt_intersection_point_shadow(constant KernelParamsMetal &launch_params_metal, - ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload, - const uint object, - const uint prim, - const uint type, - const float3 ray_origin, - const float3 ray_direction, - float time, - const float ray_tmin, - const float ray_tmax, - thread BoundingBoxIntersectionResult &result) +ccl_device_inline void metalrt_intersection_point_shadow( + constant KernelParamsMetal &launch_params_metal, + ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload, + const uint object, + const uint prim, + const uint type, + const float3 ray_P, + const float3 ray_D, + float time, + const float ray_tmin, + const float ray_tmax, + thread BoundingBoxIntersectionResult &result) { const uint visibility = payload.visibility; - float3 P = ray_origin; - float3 dir = ray_direction; - - /* The direction is not normalized by default, but the point intersection routine expects that */ - float len; - dir = normalize_len(dir, &len); - Intersection isect; isect.t = ray_tmax; - /* Transform maximum distance into object space */ - if (isect.t != FLT_MAX) - isect.t *= len; MetalKernelContext context(launch_params_metal); - if (context.point_intersect(NULL, &isect, P, dir, ray_tmin, isect.t, object, prim, time, type)) { + if (context.point_intersect( + NULL, &isect, ray_P, ray_D, ray_tmin, isect.t, object, prim, time, type)) { result.continue_search = metalrt_shadow_all_hit<METALRT_HIT_BOUNDING_BOX>( - launch_params_metal, payload, object, prim, float2(isect.u, isect.v), ray_tmax); + launch_params_metal, payload, object, prim, float2(isect.u, isect.v), ray_tmax); result.accept = !result.continue_search; if (result.accept) { - result.distance = isect.t / len; + result.distance = isect.t; } } } -[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] -BoundingBoxIntersectionResult +[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult __intersection__point(constant KernelParamsMetal &launch_params_metal [[buffer(1)]], - ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]], - const uint object [[user_instance_id]], - const uint primitive_id [[primitive_id]], - const float3 ray_origin [[origin]], - const float3 ray_direction [[direction]], - const float ray_tmin [[min_distance]], - const float ray_tmax [[max_distance]]) + ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]], + const uint object [[user_instance_id]], + const uint primitive_id [[primitive_id]], + const float3 ray_origin [[origin]], + const float3 ray_direction [[direction]], + const float ray_tmin [[min_distance]], + const float ray_tmax [[max_distance]]) { const uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object); const int type = kernel_data_fetch(objects, object).primitive_type; @@ -719,27 +721,35 @@ __intersection__point(constant KernelParamsMetal &launch_params_metal [[buffer(1 result.continue_search = true; result.distance = ray_tmax; - metalrt_intersection_point(launch_params_metal, payload, object, prim, type, ray_origin, ray_direction, + metalrt_intersection_point(launch_params_metal, + payload, + object, + prim, + type, + ray_origin, + ray_direction, # if defined(__METALRT_MOTION__) payload.time, # else 0.0f, # endif - ray_tmin, ray_tmax, result); + ray_tmin, + ray_tmax, + result); return result; } -[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] -BoundingBoxIntersectionResult +[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult __intersection__point_shadow(constant KernelParamsMetal &launch_params_metal [[buffer(1)]], - ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]], - const uint object [[user_instance_id]], - const uint primitive_id [[primitive_id]], - const float3 ray_origin [[origin]], - const float3 ray_direction [[direction]], - const float ray_tmin [[min_distance]], - const float ray_tmax [[max_distance]]) + ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload + [[payload]], + const uint object [[user_instance_id]], + const uint primitive_id [[primitive_id]], + const float3 ray_origin [[origin]], + const float3 ray_direction [[direction]], + const float ray_tmin [[min_distance]], + const float ray_tmax [[max_distance]]) { const uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object); const int type = kernel_data_fetch(objects, object).primitive_type; @@ -749,13 +759,21 @@ __intersection__point_shadow(constant KernelParamsMetal &launch_params_metal [[b result.continue_search = true; result.distance = ray_tmax; - metalrt_intersection_point_shadow(launch_params_metal, payload, object, prim, type, ray_origin, ray_direction, + metalrt_intersection_point_shadow(launch_params_metal, + payload, + object, + prim, + type, + ray_origin, + ray_direction, # if defined(__METALRT_MOTION__) - payload.time, + payload.time, # else - 0.0f, + 0.0f, # endif - ray_tmin, ray_tmax, result); + ray_tmin, + ray_tmax, + result); return result; } diff --git a/intern/cycles/kernel/device/oneapi/compat.h b/intern/cycles/kernel/device/oneapi/compat.h index 1b25259bcf5..dfaec65130c 100644 --- a/intern/cycles/kernel/device/oneapi/compat.h +++ b/intern/cycles/kernel/device/oneapi/compat.h @@ -10,6 +10,7 @@ #define CCL_NAMESPACE_END #include <cstdint> +#include <math.h> #ifndef __NODES_MAX_GROUP__ # define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX @@ -30,7 +31,7 @@ #define ccl_global #define ccl_always_inline __attribute__((always_inline)) #define ccl_device_inline inline -#define ccl_noinline +#define ccl_noinline __attribute__((noinline)) #define ccl_inline_constant const constexpr #define ccl_static_constant const #define ccl_device_forceinline __attribute__((always_inline)) @@ -54,18 +55,6 @@ #define ccl_gpu_kernel(block_num_threads, thread_num_registers) #define ccl_gpu_kernel_threads(block_num_threads) -#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED -# define KG_ND_ITEMS \ - kg->nd_item_local_id_0 = item.get_local_id(0); \ - kg->nd_item_local_range_0 = item.get_local_range(0); \ - kg->nd_item_group_0 = item.get_group(0); \ - kg->nd_item_group_range_0 = item.get_group_range(0); \ - kg->nd_item_global_id_0 = item.get_global_id(0); \ - kg->nd_item_global_range_0 = item.get_global_range(0); -#else -# define KG_ND_ITEMS -#endif - #define ccl_gpu_kernel_signature(name, ...) \ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \ size_t kernel_global_size, \ @@ -75,8 +64,7 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \ (kg); \ cgh.parallel_for<class kernel_##name>( \ sycl::nd_range<1>(kernel_global_size, kernel_local_size), \ - [=](sycl::nd_item<1> item) { \ - KG_ND_ITEMS + [=](sycl::nd_item<1> item) { #define ccl_gpu_kernel_postfix \ }); \ @@ -94,31 +82,17 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \ } ccl_gpu_kernel_lambda_pass((ONEAPIKernelContext *)kg) /* GPU thread, block, grid size and index */ -#ifndef WITH_ONEAPI_SYCL_HOST_ENABLED -# define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0)) -# define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0)) -# define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0)) -# define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0)) -# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0]) -# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) - -# define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0)) -# define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0)) -#else -# define ccl_gpu_thread_idx_x (kg->nd_item_local_id_0) -# define ccl_gpu_block_dim_x (kg->nd_item_local_range_0) -# define ccl_gpu_block_idx_x (kg->nd_item_group_0) -# define ccl_gpu_grid_dim_x (kg->nd_item_group_range_0) -# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0]) -# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) - -# define ccl_gpu_global_id_x() (kg->nd_item_global_id_0) -# define ccl_gpu_global_size_x() (kg->nd_item_global_range_0) -#endif +#define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0)) +#define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0)) +#define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0)) +#define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0)) +#define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0]) +#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) +#define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0)) +#define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0)) /* GPU warp synchronization */ - #define ccl_gpu_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier() #define ccl_gpu_local_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier(sycl::access::fence_space::local_space) #ifdef __SYCL_DEVICE_ONLY__ @@ -149,25 +123,13 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \ /* clang-format on */ /* Types */ + /* It's not possible to use sycl types like sycl::float3, sycl::int3, etc - * because these types have different interfaces from blender version */ + * because these types have different interfaces from blender version. */ using uchar = unsigned char; using sycl::half; -struct float3 { - float x, y, z; -}; - -ccl_always_inline float3 make_float3(float x, float y, float z) -{ - return {x, y, z}; -} -ccl_always_inline float3 make_float3(float x) -{ - return {x, x, x}; -} - /* math functions */ #define fabsf(x) sycl::fabs((x)) #define copysignf(x, y) sycl::copysign((x), (y)) @@ -186,21 +148,15 @@ ccl_always_inline float3 make_float3(float x) #define fmodf(x, y) sycl::fmod((x), (y)) #define lgammaf(x) sycl::lgamma((x)) -#define __forceinline __attribute__((always_inline)) - -/* Types */ -#include "util/half.h" -#include "util/types.h" - -/* NOTE(@nsirgien): Declaring these functions after types headers is very important because they - * include oneAPI headers, which transitively include math.h headers which will cause redefinitions - * of the math defines because math.h also uses them and having them defined before math.h include - * is actually UB. */ -/* Use fast math functions - get them from sycl::native namespace for native math function - * implementations */ #define cosf(x) sycl::native::cos(((float)(x))) #define sinf(x) sycl::native::sin(((float)(x))) #define powf(x, y) sycl::native::powr(((float)(x)), ((float)(y))) #define tanf(x) sycl::native::tan(((float)(x))) #define logf(x) sycl::native::log(((float)(x))) #define expf(x) sycl::native::exp(((float)(x))) + +#define __forceinline __attribute__((always_inline)) + +/* Types */ +#include "util/half.h" +#include "util/types.h" diff --git a/intern/cycles/kernel/device/oneapi/dll_interface_template.h b/intern/cycles/kernel/device/oneapi/dll_interface_template.h deleted file mode 100644 index 662068c0fed..00000000000 --- a/intern/cycles/kernel/device/oneapi/dll_interface_template.h +++ /dev/null @@ -1,53 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 - * Copyright 2022 Intel Corporation */ - -/* device_capabilities() returns a C string that must be free'd with oneapi_free(). */ -DLL_INTERFACE_CALL(oneapi_device_capabilities, char *) -DLL_INTERFACE_CALL(oneapi_free, void, void *) -DLL_INTERFACE_CALL(oneapi_get_memcapacity, size_t, SyclQueue *queue) - -DLL_INTERFACE_CALL(oneapi_get_compute_units_amount, size_t, SyclQueue *queue) -DLL_INTERFACE_CALL(oneapi_iterate_devices, void, OneAPIDeviceIteratorCallback cb, void *user_ptr) -DLL_INTERFACE_CALL(oneapi_set_error_cb, void, OneAPIErrorCallback, void *user_ptr) - -DLL_INTERFACE_CALL(oneapi_create_queue, bool, SyclQueue *&external_queue, int device_index) -DLL_INTERFACE_CALL(oneapi_free_queue, void, SyclQueue *queue) -DLL_INTERFACE_CALL( - oneapi_usm_aligned_alloc_host, void *, SyclQueue *queue, size_t memory_size, size_t alignment) -DLL_INTERFACE_CALL(oneapi_usm_alloc_device, void *, SyclQueue *queue, size_t memory_size) -DLL_INTERFACE_CALL(oneapi_usm_free, void, SyclQueue *queue, void *usm_ptr) - -DLL_INTERFACE_CALL( - oneapi_usm_memcpy, bool, SyclQueue *queue, void *dest, void *src, size_t num_bytes) -DLL_INTERFACE_CALL(oneapi_queue_synchronize, bool, SyclQueue *queue) -DLL_INTERFACE_CALL(oneapi_usm_memset, - bool, - SyclQueue *queue, - void *usm_ptr, - unsigned char value, - size_t num_bytes) - -DLL_INTERFACE_CALL(oneapi_run_test_kernel, bool, SyclQueue *queue) - -/* Operation with Kernel globals structure - map of global/constant allocation - filled before - * render/kernel execution As we don't know in cycles `sizeof` this - Cycles will manage just as - * pointer. */ -DLL_INTERFACE_CALL(oneapi_kernel_globals_size, bool, SyclQueue *queue, size_t &kernel_global_size) -DLL_INTERFACE_CALL(oneapi_set_global_memory, - void, - SyclQueue *queue, - void *kernel_globals, - const char *memory_name, - void *memory_device_pointer) - -DLL_INTERFACE_CALL(oneapi_kernel_preferred_local_size, - size_t, - SyclQueue *queue, - const DeviceKernel kernel, - const size_t kernel_global_size) -DLL_INTERFACE_CALL(oneapi_enqueue_kernel, - bool, - KernelContext *context, - int kernel, - size_t global_size, - void **args) diff --git a/intern/cycles/kernel/device/oneapi/globals.h b/intern/cycles/kernel/device/oneapi/globals.h index d60f4f135ba..116620eb725 100644 --- a/intern/cycles/kernel/device/oneapi/globals.h +++ b/intern/cycles/kernel/device/oneapi/globals.h @@ -23,15 +23,6 @@ typedef struct KernelGlobalsGPU { #undef KERNEL_DATA_ARRAY IntegratorStateGPU *integrator_state; const KernelData *__data; -#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED - size_t nd_item_local_id_0; - size_t nd_item_local_range_0; - size_t nd_item_group_0; - size_t nd_item_group_range_0; - - size_t nd_item_global_id_0; - size_t nd_item_global_range_0; -#endif } KernelGlobalsGPU; typedef ccl_global KernelGlobalsGPU *ccl_restrict KernelGlobals; diff --git a/intern/cycles/kernel/device/oneapi/image.h b/intern/cycles/kernel/device/oneapi/image.h index 6681977a675..2417b8eac3b 100644 --- a/intern/cycles/kernel/device/oneapi/image.h +++ b/intern/cycles/kernel/device/oneapi/image.h @@ -81,10 +81,15 @@ ccl_device_inline float4 svm_image_texture_read_2d(int id, int x, int y) x = svm_image_texture_wrap_periodic(x, info.width); y = svm_image_texture_wrap_periodic(y, info.height); } - else { + else if (info.extension == EXTENSION_EXTEND) { x = svm_image_texture_wrap_clamp(x, info.width); y = svm_image_texture_wrap_clamp(y, info.height); } + else { + if (x < 0 || x >= info.width || y < 0 || y >= info.height) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + } return svm_image_texture_read(info, x, y, 0); } @@ -99,11 +104,16 @@ ccl_device_inline float4 svm_image_texture_read_3d(int id, int x, int y, int z) y = svm_image_texture_wrap_periodic(y, info.height); z = svm_image_texture_wrap_periodic(z, info.depth); } - else { + else if (info.extension == EXTENSION_EXTEND) { x = svm_image_texture_wrap_clamp(x, info.width); y = svm_image_texture_wrap_clamp(y, info.height); z = svm_image_texture_wrap_clamp(z, info.depth); } + else { + if (x < 0 || x >= info.width || y < 0 || y >= info.height || z < 0 || z >= info.depth) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + } return svm_image_texture_read(info, x, y, z); } @@ -128,12 +138,6 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals, int id, float x, float { const TextureInfo &info = kernel_data_fetch(texture_info, id); - if (info.extension == EXTENSION_CLIP) { - if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - } - if (info.interpolation == INTERPOLATION_CLOSEST) { /* Closest interpolation. */ int ix, iy; @@ -315,12 +319,6 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals, int id, float3 P, in } #endif else { - if (info.extension == EXTENSION_CLIP) { - if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - } - x *= info.width; y *= info.height; z *= info.depth; diff --git a/intern/cycles/kernel/device/oneapi/kernel.cpp b/intern/cycles/kernel/device/oneapi/kernel.cpp index 300e201600c..525ae288f0c 100644 --- a/intern/cycles/kernel/device/oneapi/kernel.cpp +++ b/intern/cycles/kernel/device/oneapi/kernel.cpp @@ -3,208 +3,79 @@ #ifdef WITH_ONEAPI -/* clang-format off */ # include "kernel.h" # include <iostream> # include <map> # include <set> -# include <CL/sycl.hpp> +# include <sycl/sycl.hpp> # include "kernel/device/oneapi/compat.h" # include "kernel/device/oneapi/globals.h" # include "kernel/device/oneapi/kernel_templates.h" # include "kernel/device/gpu/kernel.h" -/* clang-format on */ static OneAPIErrorCallback s_error_cb = nullptr; static void *s_error_user_ptr = nullptr; -static std::vector<sycl::device> oneapi_available_devices(); - void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr) { s_error_cb = cb; s_error_user_ptr = user_ptr; } -void oneapi_check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false) -{ -# ifdef _DEBUG - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - sycl::info::device_type device_type = - queue->get_device().get_info<sycl::info::device::device_type>(); - sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context()); - (void)usm_type; - assert(usm_type == sycl::usm::alloc::device || - ((device_type == sycl::info::device_type::host || - device_type == sycl::info::device_type::is_cpu || allow_host) && - usm_type == sycl::usm::alloc::host)); -# endif -} - -bool oneapi_create_queue(SyclQueue *&external_queue, int device_index) -{ - bool finished_correct = true; - try { - std::vector<sycl::device> devices = oneapi_available_devices(); - if (device_index < 0 || device_index >= devices.size()) { - return false; - } - sycl::queue *created_queue = new sycl::queue(devices[device_index], - sycl::property::queue::in_order()); - external_queue = reinterpret_cast<SyclQueue *>(created_queue); - } - catch (sycl::exception const &e) { - finished_correct = false; - if (s_error_cb) { - s_error_cb(e.what(), s_error_user_ptr); - } - } - return finished_correct; -} - -void oneapi_free_queue(SyclQueue *queue_) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - delete queue; -} - -void *oneapi_usm_aligned_alloc_host(SyclQueue *queue_, size_t memory_size, size_t alignment) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - return sycl::aligned_alloc_host(alignment, memory_size, *queue); -} - -void *oneapi_usm_alloc_device(SyclQueue *queue_, size_t memory_size) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - return sycl::malloc_device(memory_size, *queue); -} - -void oneapi_usm_free(SyclQueue *queue_, void *usm_ptr) +/* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality like + * memory allocations, memory transfers and execution of kernel with USM memory. */ +bool oneapi_run_test_kernel(SyclQueue *queue_) { assert(queue_); sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - oneapi_check_usm(queue_, usm_ptr, true); - sycl::free(usm_ptr, *queue); -} + const size_t N = 8; + const size_t memory_byte_size = sizeof(int) * N; -bool oneapi_usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - oneapi_check_usm(queue_, dest, true); - oneapi_check_usm(queue_, src, true); - sycl::event mem_event = queue->memcpy(dest, src, num_bytes); -# ifdef WITH_CYCLES_DEBUG + bool is_computation_correct = true; try { - /* NOTE(@nsirgien) Waiting on memory operation may give more precise error - * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug. - */ - mem_event.wait_and_throw(); - return true; - } - catch (sycl::exception const &e) { - if (s_error_cb) { - s_error_cb(e.what(), s_error_user_ptr); - } - return false; - } -# else - sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context()); - sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context()); - bool from_device_to_host = dest_type == sycl::usm::alloc::host && - src_type == sycl::usm::alloc::device; - bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown || - src_type == sycl::usm::alloc::unknown; - /* NOTE(@sirgienko) Host-side blocking wait on this operation is mandatory, otherwise the host - * may not wait until the end of the transfer before using the memory. - */ - if (from_device_to_host || host_or_device_memop_with_offset) - mem_event.wait(); - return true; -# endif -} + int *A_host = (int *)sycl::aligned_alloc_host(16, memory_byte_size, *queue); -bool oneapi_usm_memset(SyclQueue *queue_, void *usm_ptr, unsigned char value, size_t num_bytes) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - oneapi_check_usm(queue_, usm_ptr, true); - sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes); -# ifdef WITH_CYCLES_DEBUG - try { - /* NOTE(@nsirgien) Waiting on memory operation may give more precise error - * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug. - */ - mem_event.wait_and_throw(); - return true; - } - catch (sycl::exception const &e) { - if (s_error_cb) { - s_error_cb(e.what(), s_error_user_ptr); + for (size_t i = (size_t)0; i < N; i++) { + A_host[i] = rand() % 32; } - return false; - } -# else - (void)mem_event; - return true; -# endif -} -bool oneapi_queue_synchronize(SyclQueue *queue_) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - try { - queue->wait_and_throw(); - return true; - } - catch (sycl::exception const &e) { - if (s_error_cb) { - s_error_cb(e.what(), s_error_user_ptr); - } - return false; - } -} + int *A_device = (int *)sycl::malloc_device(memory_byte_size, *queue); + int *B_device = (int *)sycl::malloc_device(memory_byte_size, *queue); -/* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality and - * also trigger runtime compilation of all existing oneAPI kernels */ -bool oneapi_run_test_kernel(SyclQueue *queue_) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - size_t N = 8; - sycl::buffer<float, 1> A(N); - sycl::buffer<float, 1> B(N); - - { - sycl::host_accessor A_host_acc(A, sycl::write_only); - for (size_t i = (size_t)0; i < N; i++) - A_host_acc[i] = rand() % 32; - } + queue->memcpy(A_device, A_host, memory_byte_size); + queue->wait_and_throw(); - try { queue->submit([&](sycl::handler &cgh) { - sycl::accessor A_acc(A, cgh, sycl::read_only); - sycl::accessor B_acc(B, cgh, sycl::write_only, sycl::no_init); - - cgh.parallel_for(N, [=](sycl::id<1> idx) { B_acc[idx] = A_acc[idx] + idx.get(0); }); + cgh.parallel_for(N, [=](sycl::id<1> idx) { B_device[idx] = A_device[idx] + idx.get(0); }); }); queue->wait_and_throw(); - sycl::host_accessor A_host_acc(A, sycl::read_only); - sycl::host_accessor B_host_acc(B, sycl::read_only); + int *B_host = (int *)sycl::aligned_alloc_host(16, memory_byte_size, *queue); + + queue->memcpy(B_host, B_device, memory_byte_size); + queue->wait_and_throw(); for (size_t i = (size_t)0; i < N; i++) { - float result = A_host_acc[i] + B_host_acc[i]; - (void)result; + const int expected_result = i + A_host[i]; + if (B_host[i] != expected_result) { + is_computation_correct = false; + if (s_error_cb) { + s_error_cb(("Incorrect result in test kernel execution - expected " + + std::to_string(expected_result) + ", got " + std::to_string(B_host[i])) + .c_str(), + s_error_user_ptr); + } + } } + + sycl::free(A_host, *queue); + sycl::free(B_host, *queue); + sycl::free(A_device, *queue); + sycl::free(B_device, *queue); + queue->wait_and_throw(); } catch (sycl::exception const &e) { if (s_error_cb) { @@ -213,63 +84,16 @@ bool oneapi_run_test_kernel(SyclQueue *queue_) return false; } - return true; -} - -bool oneapi_kernel_globals_size(SyclQueue *queue_, size_t &kernel_global_size) -{ - kernel_global_size = sizeof(KernelGlobalsGPU); - - return true; -} - -void oneapi_set_global_memory(SyclQueue *queue_, - void *kernel_globals, - const char *memory_name, - void *memory_device_pointer) -{ - assert(queue_); - assert(kernel_globals); - assert(memory_name); - assert(memory_device_pointer); - KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals; - oneapi_check_usm(queue_, memory_device_pointer); - oneapi_check_usm(queue_, kernel_globals, true); - - std::string matched_name(memory_name); - -/* This macro will change global ptr of KernelGlobals via name matching. */ -# define KERNEL_DATA_ARRAY(type, name) \ - else if (#name == matched_name) \ - { \ - globals->__##name = (type *)memory_device_pointer; \ - return; \ - } - if (false) { - } - else if ("integrator_state" == matched_name) { - globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer; - return; - } - KERNEL_DATA_ARRAY(KernelData, data) -# include "kernel/data_arrays.h" - else - { - std::cerr << "Can't found global/constant memory with name \"" << matched_name << "\"!" - << std::endl; - assert(false); - } -# undef KERNEL_DATA_ARRAY + return is_computation_correct; } /* TODO: Move device information to OneapiDevice initialized on creation and use it. */ /* TODO: Move below function to oneapi/queue.cpp. */ -size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_, +size_t oneapi_kernel_preferred_local_size(SyclQueue *queue, const DeviceKernel kernel, const size_t kernel_global_size) { - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + assert(queue); (void)kernel_global_size; const static size_t preferred_work_group_size_intersect_shading = 32; const static size_t preferred_work_group_size_technical = 1024; @@ -311,11 +135,63 @@ size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_, preferred_work_group_size = 512; } - const size_t limit_work_group_size = - queue->get_device().get_info<sycl::info::device::max_work_group_size>(); + const size_t limit_work_group_size = reinterpret_cast<sycl::queue *>(queue) + ->get_device() + .get_info<sycl::info::device::max_work_group_size>(); + return std::min(limit_work_group_size, preferred_work_group_size); } +bool oneapi_load_kernels(SyclQueue *queue_, const uint requested_features) +{ +# ifdef SYCL_SKIP_KERNELS_PRELOAD + (void)queue_; + (void)requested_features; +# else + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + + try { + sycl::kernel_bundle<sycl::bundle_state::input> all_kernels_bundle = + sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(), + {queue->get_device()}); + + for (const sycl::kernel_id &kernel_id : all_kernels_bundle.get_kernel_ids()) { + const std::string &kernel_name = kernel_id.get_name(); + + /* NOTE(@nsirgien): Names in this conditions below should match names from + * oneapi_call macro in oneapi_enqueue_kernel below */ + if (((requested_features & KERNEL_FEATURE_VOLUME) == 0) && + kernel_name.find("oneapi_kernel_integrator_shade_volume") != std::string::npos) { + continue; + } + + if (((requested_features & KERNEL_FEATURE_MNEE) == 0) && + kernel_name.find("oneapi_kernel_integrator_shade_surface_mnee") != std::string::npos) { + continue; + } + + if (((requested_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0) && + kernel_name.find("oneapi_kernel_integrator_shade_surface_raytrace") != + std::string::npos) { + continue; + } + + sycl::kernel_bundle<sycl::bundle_state::input> one_kernel_bundle = + sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(), {kernel_id}); + sycl::build(one_kernel_bundle); + } + } + catch (sycl::exception const &e) { + if (s_error_cb) { + s_error_cb(e.what(), s_error_user_ptr); + } + return false; + } +# endif + return true; +} + bool oneapi_enqueue_kernel(KernelContext *kernel_context, int kernel, size_t global_size, @@ -354,13 +230,6 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context, /* NOTE(@nsirgien): As for now non-uniform work-groups don't work on most oneAPI devices, * we extend work size to fit uniformity requirements. */ global_size = groups_count * local_size; - -# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED - if (queue->get_device().is_host()) { - global_size = 1; - local_size = 1; - } -# endif } /* Let the compiler throw an error if there are any kernels missing in this implementation. */ @@ -645,13 +514,9 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context, /* Unsupported kernels */ case DEVICE_KERNEL_NUM: case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL: - assert(0); - return false; + kernel_assert(0); + break; } - - /* Unknown kernel. */ - assert(0); - return false; }); } catch (sycl::exception const &e) { @@ -668,247 +533,4 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context, # endif return success; } - -static const int lowest_supported_driver_version_win = 1011660; -static const int lowest_supported_driver_version_neo = 23570; - -static int parse_driver_build_version(const sycl::device &device) -{ - const std::string &driver_version = device.get_info<sycl::info::device::driver_version>(); - int driver_build_version = 0; - - size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1); - if (second_dot_position == std::string::npos) { - std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version - << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0)," - << " xx.xx.xxx.xxxx (Windows) for device \"" - << device.get_info<sycl::info::device::name>() << "\"." << std::endl; - } - else { - try { - size_t third_dot_position = driver_version.find('.', second_dot_position + 1); - if (third_dot_position != std::string::npos) { - const std::string &third_number_substr = driver_version.substr( - second_dot_position + 1, third_dot_position - second_dot_position - 1); - const std::string &forth_number_substr = driver_version.substr(third_dot_position + 1); - if (third_number_substr.length() == 3 && forth_number_substr.length() == 4) - driver_build_version = std::stoi(third_number_substr) * 10000 + - std::stoi(forth_number_substr); - } - else { - const std::string &third_number_substr = driver_version.substr(second_dot_position + 1); - driver_build_version = std::stoi(third_number_substr); - } - } - catch (std::invalid_argument &e) { - std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version - << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0)," - << " xx.xx.xxx.xxxx (Windows) for device \"" - << device.get_info<sycl::info::device::name>() << "\"." << std::endl; - } - } - - return driver_build_version; -} - -static std::vector<sycl::device> oneapi_available_devices() -{ - bool allow_all_devices = false; - if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr) - allow_all_devices = true; - - /* Host device is useful only for debugging at the moment - * so we hide this device with default build settings. */ -# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED - bool allow_host = true; -# else - bool allow_host = false; -# endif - - const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms(); - - std::vector<sycl::device> available_devices; - for (const sycl::platform &platform : oneapi_platforms) { - /* ignore OpenCL platforms to avoid using the same devices through both Level-Zero and OpenCL. - */ - if (platform.get_backend() == sycl::backend::opencl) { - continue; - } - - const std::vector<sycl::device> &oneapi_devices = - (allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) : - platform.get_devices(sycl::info::device_type::gpu); - - for (const sycl::device &device : oneapi_devices) { - if (allow_all_devices) { - /* still filter out host device if build doesn't support it. */ - if (allow_host || !device.is_host()) { - available_devices.push_back(device); - } - } - else { - bool filter_out = false; - - /* For now we support all Intel(R) Arc(TM) devices and likely any future GPU, - * assuming they have either more than 96 Execution Units or not 7 threads per EU. - * Official support can be broaden to older and smaller GPUs once ready. */ - if (device.is_gpu() && platform.get_backend() == sycl::backend::ext_oneapi_level_zero) { - /* Filtered-out defaults in-case these values aren't available through too old L0 - * runtime. */ - int number_of_eus = 96; - int threads_per_eu = 7; - if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) { - number_of_eus = device.get_info<sycl::info::device::ext_intel_gpu_eu_count>(); - } - if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) { - threads_per_eu = - device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>(); - } - /* This filters out all Level-Zero supported GPUs from older generation than Arc. */ - if (number_of_eus <= 96 && threads_per_eu == 7) { - filter_out = true; - } - /* if not already filtered out, check driver version. */ - if (!filter_out) { - int driver_build_version = parse_driver_build_version(device); - if ((driver_build_version > 100000 && - driver_build_version < lowest_supported_driver_version_win) || - (driver_build_version > 0 && - driver_build_version < lowest_supported_driver_version_neo)) { - filter_out = true; - } - } - } - else if (!allow_host && device.is_host()) { - filter_out = true; - } - else if (!allow_all_devices) { - filter_out = true; - } - - if (!filter_out) { - available_devices.push_back(device); - } - } - } - } - - return available_devices; -} - -char *oneapi_device_capabilities() -{ - std::stringstream capabilities; - - const std::vector<sycl::device> &oneapi_devices = oneapi_available_devices(); - for (const sycl::device &device : oneapi_devices) { - const std::string &name = device.get_info<sycl::info::device::name>(); - - capabilities << std::string("\t") << name << "\n"; -# define WRITE_ATTR(attribute_name, attribute_variable) \ - capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \ - << "\n"; -# define GET_NUM_ATTR(attribute) \ - { \ - size_t attribute = (size_t)device.get_info<sycl::info::device ::attribute>(); \ - capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" << attribute << "\n"; \ - } - - GET_NUM_ATTR(vendor_id) - GET_NUM_ATTR(max_compute_units) - GET_NUM_ATTR(max_work_item_dimensions) - - sycl::id<3> max_work_item_sizes = device.get_info<sycl::info::device::max_work_item_sizes>(); - WRITE_ATTR("max_work_item_sizes_dim0", ((size_t)max_work_item_sizes.get(0))) - WRITE_ATTR("max_work_item_sizes_dim1", ((size_t)max_work_item_sizes.get(1))) - WRITE_ATTR("max_work_item_sizes_dim2", ((size_t)max_work_item_sizes.get(2))) - - GET_NUM_ATTR(max_work_group_size) - GET_NUM_ATTR(max_num_sub_groups) - GET_NUM_ATTR(sub_group_independent_forward_progress) - - GET_NUM_ATTR(preferred_vector_width_char) - GET_NUM_ATTR(preferred_vector_width_short) - GET_NUM_ATTR(preferred_vector_width_int) - GET_NUM_ATTR(preferred_vector_width_long) - GET_NUM_ATTR(preferred_vector_width_float) - GET_NUM_ATTR(preferred_vector_width_double) - GET_NUM_ATTR(preferred_vector_width_half) - - GET_NUM_ATTR(native_vector_width_char) - GET_NUM_ATTR(native_vector_width_short) - GET_NUM_ATTR(native_vector_width_int) - GET_NUM_ATTR(native_vector_width_long) - GET_NUM_ATTR(native_vector_width_float) - GET_NUM_ATTR(native_vector_width_double) - GET_NUM_ATTR(native_vector_width_half) - - size_t max_clock_frequency = - (size_t)(device.is_host() ? (size_t)0 : - device.get_info<sycl::info::device::max_clock_frequency>()); - WRITE_ATTR("max_clock_frequency", max_clock_frequency) - - GET_NUM_ATTR(address_bits) - GET_NUM_ATTR(max_mem_alloc_size) - - /* NOTE(@nsirgien): Implementation doesn't use image support as bindless images aren't - * supported so we always return false, even if device supports HW texture usage acceleration. - */ - bool image_support = false; - WRITE_ATTR("image_support", (size_t)image_support) - - GET_NUM_ATTR(max_parameter_size) - GET_NUM_ATTR(mem_base_addr_align) - GET_NUM_ATTR(global_mem_size) - GET_NUM_ATTR(local_mem_size) - GET_NUM_ATTR(error_correction_support) - GET_NUM_ATTR(profiling_timer_resolution) - GET_NUM_ATTR(is_available) - -# undef GET_NUM_ATTR -# undef WRITE_ATTR - capabilities << "\n"; - } - - return ::strdup(capabilities.str().c_str()); -} - -void oneapi_free(void *p) -{ - if (p) { - ::free(p); - } -} - -void oneapi_iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr) -{ - int num = 0; - std::vector<sycl::device> devices = oneapi_available_devices(); - for (sycl::device &device : devices) { - const std::string &platform_name = - device.get_platform().get_info<sycl::info::platform::name>(); - std::string name = device.get_info<sycl::info::device::name>(); - std::string id = "ONEAPI_" + platform_name + "_" + name; - if (device.has(sycl::aspect::ext_intel_pci_address)) { - id.append("_" + device.get_info<sycl::info::device::ext_intel_pci_address>()); - } - (cb)(id.c_str(), name.c_str(), num, user_ptr); - num++; - } -} - -size_t oneapi_get_memcapacity(SyclQueue *queue) -{ - return reinterpret_cast<sycl::queue *>(queue) - ->get_device() - .get_info<sycl::info::device::global_mem_size>(); -} - -size_t oneapi_get_compute_units_amount(SyclQueue *queue) -{ - return reinterpret_cast<sycl::queue *>(queue) - ->get_device() - .get_info<sycl::info::device::max_compute_units>(); -} - #endif /* WITH_ONEAPI */ diff --git a/intern/cycles/kernel/device/oneapi/kernel.h b/intern/cycles/kernel/device/oneapi/kernel.h index c5f853742ed..2bfc0b89c87 100644 --- a/intern/cycles/kernel/device/oneapi/kernel.h +++ b/intern/cycles/kernel/device/oneapi/kernel.h @@ -25,11 +25,6 @@ enum DeviceKernel : int; class SyclQueue; -typedef void (*OneAPIDeviceIteratorCallback)(const char *id, - const char *name, - int num, - void *user_ptr); - typedef void (*OneAPIErrorCallback)(const char *error, void *user_ptr); struct KernelContext { @@ -45,13 +40,17 @@ struct KernelContext { extern "C" { # endif -# define DLL_INTERFACE_CALL(function, return_type, ...) \ - CYCLES_KERNEL_ONEAPI_EXPORT return_type function(__VA_ARGS__); -# include "kernel/device/oneapi/dll_interface_template.h" -# undef DLL_INTERFACE_CALL - +CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_run_test_kernel(SyclQueue *queue_); +CYCLES_KERNEL_ONEAPI_EXPORT void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr); +CYCLES_KERNEL_ONEAPI_EXPORT size_t oneapi_kernel_preferred_local_size( + SyclQueue *queue, const DeviceKernel kernel, const size_t kernel_global_size); +CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_enqueue_kernel(KernelContext *context, + int kernel, + size_t global_size, + void **args); +CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_load_kernels(SyclQueue *queue, + const unsigned int requested_features); # ifdef __cplusplus } # endif - #endif /* WITH_ONEAPI */ diff --git a/intern/cycles/kernel/device/oneapi/kernel_templates.h b/intern/cycles/kernel/device/oneapi/kernel_templates.h index d8964d9b672..0ae925cf748 100644 --- a/intern/cycles/kernel/device/oneapi/kernel_templates.h +++ b/intern/cycles/kernel/device/oneapi/kernel_templates.h @@ -80,7 +80,7 @@ void oneapi_call( (x, ##__VA_ARGS__) /* This template automatically casts entries in the void **args array to the types requested by the kernel func. - Since kernel parameters are passed as void ** to the device, this is the closest that we have to type safety. */ + * Since kernel parameters are passed as void ** to the device, this is the closest that we have to type safety. */ #define oneapi_template(...) \ template<ONEAPI_CALL_FOR(ONEAPI_TYP, __VA_ARGS__)> \ void oneapi_call( \ diff --git a/intern/cycles/kernel/device/optix/bvh.h b/intern/cycles/kernel/device/optix/bvh.h new file mode 100644 index 00000000000..6d81b44660c --- /dev/null +++ b/intern/cycles/kernel/device/optix/bvh.h @@ -0,0 +1,659 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Blender Foundation */ + +/* OptiX implementation of ray-scene intersection. */ + +#pragma once + +#include "kernel/bvh/types.h" +#include "kernel/bvh/util.h" + +#define OPTIX_DEFINE_ABI_VERSION_ONLY +#include <optix_function_table.h> + +CCL_NAMESPACE_BEGIN + +/* Utilities. */ + +template<typename T> ccl_device_forceinline T *get_payload_ptr_0() +{ + return pointer_unpack_from_uint<T>(optixGetPayload_0(), optixGetPayload_1()); +} +template<typename T> ccl_device_forceinline T *get_payload_ptr_2() +{ + return pointer_unpack_from_uint<T>(optixGetPayload_2(), optixGetPayload_3()); +} + +template<typename T> ccl_device_forceinline T *get_payload_ptr_6() +{ + return (T *)(((uint64_t)optixGetPayload_7() << 32) | optixGetPayload_6()); +} + +ccl_device_forceinline int get_object_id() +{ +#ifdef __OBJECT_MOTION__ + /* Always get the instance ID from the TLAS + * There might be a motion transform node between TLAS and BLAS which does not have one. */ + return optixGetInstanceIdFromHandle(optixGetTransformListHandle(0)); +#else + return optixGetInstanceId(); +#endif +} + +/* Hit/miss functions. */ + +extern "C" __global__ void __miss__kernel_optix_miss() +{ + /* 'kernel_path_lamp_emission' checks intersection distance, so need to set it even on a miss. */ + optixSetPayload_0(__float_as_uint(optixGetRayTmax())); + optixSetPayload_5(PRIMITIVE_NONE); +} + +extern "C" __global__ void __anyhit__kernel_optix_local_hit() +{ +#if defined(__HAIR__) || defined(__POINTCLOUD__) + if (!optixIsTriangleHit()) { + /* Ignore curves and points. */ + return optixIgnoreIntersection(); + } +#endif + +#ifdef __BVH_LOCAL__ + const int object = get_object_id(); + if (object != optixGetPayload_4() /* local_object */) { + /* Only intersect with matching object. */ + return optixIgnoreIntersection(); + } + + const int prim = optixGetPrimitiveIndex(); + ccl_private Ray *const ray = get_payload_ptr_6<Ray>(); + if (intersection_skip_self_local(ray->self, prim)) { + return optixIgnoreIntersection(); + } + + const uint max_hits = optixGetPayload_5(); + if (max_hits == 0) { + /* Special case for when no hit information is requested, just report that something was hit */ + optixSetPayload_5(true); + return optixTerminateRay(); + } + + int hit = 0; + uint *const lcg_state = get_payload_ptr_0<uint>(); + LocalIntersection *const local_isect = get_payload_ptr_2<LocalIntersection>(); + + if (lcg_state) { + for (int i = min(max_hits, local_isect->num_hits) - 1; i >= 0; --i) { + if (optixGetRayTmax() == local_isect->hits[i].t) { + return optixIgnoreIntersection(); + } + } + + hit = local_isect->num_hits++; + + if (local_isect->num_hits > max_hits) { + hit = lcg_step_uint(lcg_state) % local_isect->num_hits; + if (hit >= max_hits) { + return optixIgnoreIntersection(); + } + } + } + else { + if (local_isect->num_hits && optixGetRayTmax() > local_isect->hits[0].t) { + /* Record closest intersection only. + * Do not terminate ray here, since there is no guarantee about distance ordering in any-hit. + */ + return optixIgnoreIntersection(); + } + + local_isect->num_hits = 1; + } + + Intersection *isect = &local_isect->hits[hit]; + isect->t = optixGetRayTmax(); + isect->prim = prim; + isect->object = get_object_id(); + isect->type = kernel_data_fetch(objects, isect->object).primitive_type; + + const float2 barycentrics = optixGetTriangleBarycentrics(); + isect->u = barycentrics.x; + isect->v = barycentrics.y; + + /* Record geometric normal. */ + const uint tri_vindex = kernel_data_fetch(tri_vindex, prim).w; + const float3 tri_a = kernel_data_fetch(tri_verts, tri_vindex + 0); + const float3 tri_b = kernel_data_fetch(tri_verts, tri_vindex + 1); + const float3 tri_c = kernel_data_fetch(tri_verts, tri_vindex + 2); + local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a)); + + /* Continue tracing (without this the trace call would return after the first hit). */ + optixIgnoreIntersection(); +#endif +} + +extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit() +{ +#ifdef __SHADOW_RECORD_ALL__ + int prim = optixGetPrimitiveIndex(); + const uint object = get_object_id(); +# ifdef __VISIBILITY_FLAG__ + const uint visibility = optixGetPayload_4(); + if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) { + return optixIgnoreIntersection(); + } +# endif + + float u = 0.0f, v = 0.0f; + int type = 0; + if (optixIsTriangleHit()) { + /* Triangle. */ + const float2 barycentrics = optixGetTriangleBarycentrics(); + u = barycentrics.x; + v = barycentrics.y; + type = kernel_data_fetch(objects, object).primitive_type; + } +# ifdef __HAIR__ + else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) { + /* Curve. */ + u = __uint_as_float(optixGetAttribute_0()); + v = __uint_as_float(optixGetAttribute_1()); + + const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim); + type = segment.type; + prim = segment.prim; + +# if OPTIX_ABI_VERSION < 55 + /* Filter out curve end-caps. */ + if (u == 0.0f || u == 1.0f) { + return optixIgnoreIntersection(); + } +# endif + } +# endif + else { + /* Point. */ + type = kernel_data_fetch(objects, object).primitive_type; + u = 0.0f; + v = 0.0f; + } + + ccl_private Ray *const ray = get_payload_ptr_6<Ray>(); + if (intersection_skip_self_shadow(ray->self, object, prim)) { + return optixIgnoreIntersection(); + } + +# ifndef __TRANSPARENT_SHADOWS__ + /* No transparent shadows support compiled in, make opaque. */ + optixSetPayload_5(true); + return optixTerminateRay(); +# else + const uint max_hits = optixGetPayload_3(); + const uint num_hits_packed = optixGetPayload_2(); + const uint num_recorded_hits = uint16_unpack_from_uint_0(num_hits_packed); + const uint num_hits = uint16_unpack_from_uint_1(num_hits_packed); + + /* If no transparent shadows, all light is blocked and we can stop immediately. */ + if (num_hits >= max_hits || + !(intersection_get_shader_flags(NULL, prim, type) & SD_HAS_TRANSPARENT_SHADOW)) { + optixSetPayload_5(true); + return optixTerminateRay(); + } + + /* Always use baked shadow transparency for curves. */ + if (type & PRIMITIVE_CURVE) { + float throughput = __uint_as_float(optixGetPayload_1()); + throughput *= intersection_curve_shadow_transparency(nullptr, object, prim, type, u); + optixSetPayload_1(__float_as_uint(throughput)); + optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits, num_hits + 1)); + + if (throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) { + optixSetPayload_5(true); + return optixTerminateRay(); + } + else { + /* Continue tracing. */ + optixIgnoreIntersection(); + return; + } + } + + /* Record transparent intersection. */ + optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits + 1, num_hits + 1)); + + uint record_index = num_recorded_hits; + + const IntegratorShadowState state = optixGetPayload_0(); + + const uint max_record_hits = min(max_hits, INTEGRATOR_SHADOW_ISECT_SIZE); + if (record_index >= max_record_hits) { + /* If maximum number of hits reached, find a hit to replace. */ + float max_recorded_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, t); + uint max_recorded_hit = 0; + + for (int i = 1; i < max_record_hits; i++) { + const float isect_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, i, t); + if (isect_t > max_recorded_t) { + max_recorded_t = isect_t; + max_recorded_hit = i; + } + } + + if (optixGetRayTmax() >= max_recorded_t) { + /* Accept hit, so that OptiX won't consider any more hits beyond the distance of the + * current hit anymore. */ + return; + } + + record_index = max_recorded_hit; + } + + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, u) = u; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, v) = v; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, t) = optixGetRayTmax(); + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, prim) = prim; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type; + + /* Continue tracing. */ + optixIgnoreIntersection(); +# endif /* __TRANSPARENT_SHADOWS__ */ +#endif /* __SHADOW_RECORD_ALL__ */ +} + +extern "C" __global__ void __anyhit__kernel_optix_volume_test() +{ +#if defined(__HAIR__) || defined(__POINTCLOUD__) + if (!optixIsTriangleHit()) { + /* Ignore curves. */ + return optixIgnoreIntersection(); + } +#endif + + const uint object = get_object_id(); +#ifdef __VISIBILITY_FLAG__ + const uint visibility = optixGetPayload_4(); + if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) { + return optixIgnoreIntersection(); + } +#endif + + if ((kernel_data_fetch(object_flag, object) & SD_OBJECT_HAS_VOLUME) == 0) { + return optixIgnoreIntersection(); + } + + const int prim = optixGetPrimitiveIndex(); + ccl_private Ray *const ray = get_payload_ptr_6<Ray>(); + if (intersection_skip_self(ray->self, object, prim)) { + return optixIgnoreIntersection(); + } +} + +extern "C" __global__ void __anyhit__kernel_optix_visibility_test() +{ +#ifdef __HAIR__ +# if OPTIX_ABI_VERSION < 55 + if (optixGetPrimitiveType() == OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE) { + /* Filter out curve end-caps. */ + const float u = __uint_as_float(optixGetAttribute_0()); + if (u == 0.0f || u == 1.0f) { + return optixIgnoreIntersection(); + } + } +# endif +#endif + + const uint object = get_object_id(); + const uint visibility = optixGetPayload_4(); +#ifdef __VISIBILITY_FLAG__ + if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) { + return optixIgnoreIntersection(); + } +#endif + + int prim = optixGetPrimitiveIndex(); + if (optixIsTriangleHit()) { + /* Triangle. */ + } +#ifdef __HAIR__ + else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) { + /* Curve. */ + prim = kernel_data_fetch(curve_segments, prim).prim; + } +#endif + + ccl_private Ray *const ray = get_payload_ptr_6<Ray>(); + + if (visibility & PATH_RAY_SHADOW_OPAQUE) { + if (intersection_skip_self_shadow(ray->self, object, prim)) { + return optixIgnoreIntersection(); + } + else { + /* Shadow ray early termination. */ + return optixTerminateRay(); + } + } + else { + if (intersection_skip_self(ray->self, object, prim)) { + return optixIgnoreIntersection(); + } + } +} + +extern "C" __global__ void __closesthit__kernel_optix_hit() +{ + const int object = get_object_id(); + const int prim = optixGetPrimitiveIndex(); + + optixSetPayload_0(__float_as_uint(optixGetRayTmax())); /* Intersection distance */ + optixSetPayload_4(object); + + if (optixIsTriangleHit()) { + const float2 barycentrics = optixGetTriangleBarycentrics(); + optixSetPayload_1(__float_as_uint(barycentrics.x)); + optixSetPayload_2(__float_as_uint(barycentrics.y)); + optixSetPayload_3(prim); + optixSetPayload_5(kernel_data_fetch(objects, object).primitive_type); + } + else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) { + const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim); + optixSetPayload_1(optixGetAttribute_0()); /* Same as 'optixGetCurveParameter()' */ + optixSetPayload_2(optixGetAttribute_1()); + optixSetPayload_3(segment.prim); + optixSetPayload_5(segment.type); + } + else { + optixSetPayload_1(0); + optixSetPayload_2(0); + optixSetPayload_3(prim); + optixSetPayload_5(kernel_data_fetch(objects, object).primitive_type); + } +} + +/* Custom primitive intersection functions. */ + +#ifdef __HAIR__ +ccl_device_inline void optix_intersection_curve(const int prim, const int type) +{ + const int object = get_object_id(); + +# ifdef __VISIBILITY_FLAG__ + const uint visibility = optixGetPayload_4(); + if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) { + return; + } +# endif + + const float3 ray_P = optixGetObjectRayOrigin(); + const float3 ray_D = optixGetObjectRayDirection(); + const float ray_tmin = optixGetRayTmin(); + +# ifdef __OBJECT_MOTION__ + const float time = optixGetRayTime(); +# else + const float time = 0.0f; +# endif + + Intersection isect; + isect.t = optixGetRayTmax(); + + if (curve_intersect(NULL, &isect, ray_P, ray_D, ray_tmin, isect.t, object, prim, time, type)) { + static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use"); + optixReportIntersection(isect.t, + type & PRIMITIVE_ALL, + __float_as_int(isect.u), /* Attribute_0 */ + __float_as_int(isect.v)); /* Attribute_1 */ + } +} + +extern "C" __global__ void __intersection__curve_ribbon() +{ + const KernelCurveSegment segment = kernel_data_fetch(curve_segments, optixGetPrimitiveIndex()); + const int prim = segment.prim; + const int type = segment.type; + if (type & PRIMITIVE_CURVE_RIBBON) { + optix_intersection_curve(prim, type); + } +} + +#endif + +#ifdef __POINTCLOUD__ +extern "C" __global__ void __intersection__point() +{ + const int prim = optixGetPrimitiveIndex(); + const int object = get_object_id(); + const int type = kernel_data_fetch(objects, object).primitive_type; + +# ifdef __VISIBILITY_FLAG__ + const uint visibility = optixGetPayload_4(); + if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) { + return; + } +# endif + + const float3 ray_P = optixGetObjectRayOrigin(); + const float3 ray_D = optixGetObjectRayDirection(); + const float ray_tmin = optixGetRayTmin(); + +# ifdef __OBJECT_MOTION__ + const float time = optixGetRayTime(); +# else + const float time = 0.0f; +# endif + + Intersection isect; + isect.t = optixGetRayTmax(); + + if (point_intersect(NULL, &isect, ray_P, ray_D, ray_tmin, isect.t, object, prim, time, type)) { + static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use"); + optixReportIntersection(isect.t, type & PRIMITIVE_ALL); + } +} +#endif + +/* Scene intersection. */ + +ccl_device_intersect bool scene_intersect(KernelGlobals kg, + ccl_private const Ray *ray, + const uint visibility, + ccl_private Intersection *isect) +{ + uint p0 = 0; + uint p1 = 0; + uint p2 = 0; + uint p3 = 0; + uint p4 = visibility; + uint p5 = PRIMITIVE_NONE; + uint p6 = ((uint64_t)ray) & 0xFFFFFFFF; + uint p7 = (((uint64_t)ray) >> 32) & 0xFFFFFFFF; + + uint ray_mask = visibility & 0xFF; + uint ray_flags = OPTIX_RAY_FLAG_ENFORCE_ANYHIT; + if (0 == ray_mask && (visibility & ~0xFF) != 0) { + ray_mask = 0xFF; + } + else if (visibility & PATH_RAY_SHADOW_OPAQUE) { + ray_flags |= OPTIX_RAY_FLAG_TERMINATE_ON_FIRST_HIT; + } + + optixTrace(intersection_ray_valid(ray) ? kernel_data.device_bvh : 0, + ray->P, + ray->D, + ray->tmin, + ray->tmax, + ray->time, + ray_mask, + ray_flags, + 0, /* SBT offset for PG_HITD */ + 0, + 0, + p0, + p1, + p2, + p3, + p4, + p5, + p6, + p7); + + isect->t = __uint_as_float(p0); + isect->u = __uint_as_float(p1); + isect->v = __uint_as_float(p2); + isect->prim = p3; + isect->object = p4; + isect->type = p5; + + return p5 != PRIMITIVE_NONE; +} + +#ifdef __BVH_LOCAL__ +ccl_device_intersect bool scene_intersect_local(KernelGlobals kg, + ccl_private const Ray *ray, + ccl_private LocalIntersection *local_isect, + int local_object, + ccl_private uint *lcg_state, + int max_hits) +{ + uint p0 = pointer_pack_to_uint_0(lcg_state); + uint p1 = pointer_pack_to_uint_1(lcg_state); + uint p2 = pointer_pack_to_uint_0(local_isect); + uint p3 = pointer_pack_to_uint_1(local_isect); + uint p4 = local_object; + uint p6 = ((uint64_t)ray) & 0xFFFFFFFF; + uint p7 = (((uint64_t)ray) >> 32) & 0xFFFFFFFF; + + /* Is set to zero on miss or if ray is aborted, so can be used as return value. */ + uint p5 = max_hits; + + if (local_isect) { + local_isect->num_hits = 0; /* Initialize hit count to zero. */ + } + optixTrace(intersection_ray_valid(ray) ? kernel_data.device_bvh : 0, + ray->P, + ray->D, + ray->tmin, + ray->tmax, + ray->time, + 0xFF, + /* Need to always call into __anyhit__kernel_optix_local_hit. */ + OPTIX_RAY_FLAG_ENFORCE_ANYHIT, + 2, /* SBT offset for PG_HITL */ + 0, + 0, + p0, + p1, + p2, + p3, + p4, + p5, + p6, + p7); + + return p5; +} +#endif + +#ifdef __SHADOW_RECORD_ALL__ +ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg, + IntegratorShadowState state, + ccl_private const Ray *ray, + uint visibility, + uint max_hits, + ccl_private uint *num_recorded_hits, + ccl_private float *throughput) +{ + uint p0 = state; + uint p1 = __float_as_uint(1.0f); /* Throughput. */ + uint p2 = 0; /* Number of hits. */ + uint p3 = max_hits; + uint p4 = visibility; + uint p5 = false; + uint p6 = ((uint64_t)ray) & 0xFFFFFFFF; + uint p7 = (((uint64_t)ray) >> 32) & 0xFFFFFFFF; + + uint ray_mask = visibility & 0xFF; + if (0 == ray_mask && (visibility & ~0xFF) != 0) { + ray_mask = 0xFF; + } + + optixTrace(intersection_ray_valid(ray) ? kernel_data.device_bvh : 0, + ray->P, + ray->D, + ray->tmin, + ray->tmax, + ray->time, + ray_mask, + /* Need to always call into __anyhit__kernel_optix_shadow_all_hit. */ + OPTIX_RAY_FLAG_ENFORCE_ANYHIT, + 1, /* SBT offset for PG_HITS */ + 0, + 0, + p0, + p1, + p2, + p3, + p4, + p5, + p6, + p7); + + *num_recorded_hits = uint16_unpack_from_uint_0(p2); + *throughput = __uint_as_float(p1); + + return p5; +} +#endif + +#ifdef __VOLUME__ +ccl_device_intersect bool scene_intersect_volume(KernelGlobals kg, + ccl_private const Ray *ray, + ccl_private Intersection *isect, + const uint visibility) +{ + uint p0 = 0; + uint p1 = 0; + uint p2 = 0; + uint p3 = 0; + uint p4 = visibility; + uint p5 = PRIMITIVE_NONE; + uint p6 = ((uint64_t)ray) & 0xFFFFFFFF; + uint p7 = (((uint64_t)ray) >> 32) & 0xFFFFFFFF; + + uint ray_mask = visibility & 0xFF; + if (0 == ray_mask && (visibility & ~0xFF) != 0) { + ray_mask = 0xFF; + } + + optixTrace(intersection_ray_valid(ray) ? kernel_data.device_bvh : 0, + ray->P, + ray->D, + ray->tmin, + ray->tmax, + ray->time, + ray_mask, + /* Need to always call into __anyhit__kernel_optix_volume_test. */ + OPTIX_RAY_FLAG_ENFORCE_ANYHIT, + 3, /* SBT offset for PG_HITV */ + 0, + 0, + p0, + p1, + p2, + p3, + p4, + p5, + p6, + p7); + + isect->t = __uint_as_float(p0); + isect->u = __uint_as_float(p1); + isect->v = __uint_as_float(p2); + isect->prim = p3; + isect->object = p4; + isect->type = p5; + + return p5 != PRIMITIVE_NONE; +} +#endif + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h index aa4a6321a8b..1a11a533b7e 100644 --- a/intern/cycles/kernel/device/optix/compat.h +++ b/intern/cycles/kernel/device/optix/compat.h @@ -8,7 +8,6 @@ #include <optix.h> #define __KERNEL_GPU__ -#define __KERNEL_GPU_RAYTRACING__ #define __KERNEL_CUDA__ /* OptiX kernels are implicitly CUDA kernels too */ #define __KERNEL_OPTIX__ #define CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/kernel/device/optix/kernel.cu b/intern/cycles/kernel/device/optix/kernel.cu index 510f7cca5d6..6abb5aeacb9 100644 --- a/intern/cycles/kernel/device/optix/kernel.cu +++ b/intern/cycles/kernel/device/optix/kernel.cu @@ -20,34 +20,6 @@ #include "kernel/integrator/intersect_volume_stack.h" // clang-format on -#define OPTIX_DEFINE_ABI_VERSION_ONLY -#include <optix_function_table.h> - -template<typename T> ccl_device_forceinline T *get_payload_ptr_0() -{ - return pointer_unpack_from_uint<T>(optixGetPayload_0(), optixGetPayload_1()); -} -template<typename T> ccl_device_forceinline T *get_payload_ptr_2() -{ - return pointer_unpack_from_uint<T>(optixGetPayload_2(), optixGetPayload_3()); -} - -template<typename T> ccl_device_forceinline T *get_payload_ptr_6() -{ - return (T *)(((uint64_t)optixGetPayload_7() << 32) | optixGetPayload_6()); -} - -ccl_device_forceinline int get_object_id() -{ -#ifdef __OBJECT_MOTION__ - /* Always get the instance ID from the TLAS - * There might be a motion transform node between TLAS and BLAS which does not have one. */ - return optixGetInstanceIdFromHandle(optixGetTransformListHandle(0)); -#else - return optixGetInstanceId(); -#endif -} - extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_closest() { const int global_index = optixGetLaunchIndex().x; @@ -84,411 +56,3 @@ extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_volume_st integrator_intersect_volume_stack(nullptr, path_index); } -extern "C" __global__ void __miss__kernel_optix_miss() -{ - /* 'kernel_path_lamp_emission' checks intersection distance, so need to set it even on a miss. */ - optixSetPayload_0(__float_as_uint(optixGetRayTmax())); - optixSetPayload_5(PRIMITIVE_NONE); -} - -extern "C" __global__ void __anyhit__kernel_optix_local_hit() -{ -#if defined(__HAIR__) || defined(__POINTCLOUD__) - if (!optixIsTriangleHit()) { - /* Ignore curves and points. */ - return optixIgnoreIntersection(); - } -#endif - -#ifdef __BVH_LOCAL__ - const int object = get_object_id(); - if (object != optixGetPayload_4() /* local_object */) { - /* Only intersect with matching object. */ - return optixIgnoreIntersection(); - } - - const int prim = optixGetPrimitiveIndex(); - ccl_private Ray *const ray = get_payload_ptr_6<Ray>(); - if (intersection_skip_self_local(ray->self, prim)) { - return optixIgnoreIntersection(); - } - - const uint max_hits = optixGetPayload_5(); - if (max_hits == 0) { - /* Special case for when no hit information is requested, just report that something was hit */ - optixSetPayload_5(true); - return optixTerminateRay(); - } - - int hit = 0; - uint *const lcg_state = get_payload_ptr_0<uint>(); - LocalIntersection *const local_isect = get_payload_ptr_2<LocalIntersection>(); - - if (lcg_state) { - for (int i = min(max_hits, local_isect->num_hits) - 1; i >= 0; --i) { - if (optixGetRayTmax() == local_isect->hits[i].t) { - return optixIgnoreIntersection(); - } - } - - hit = local_isect->num_hits++; - - if (local_isect->num_hits > max_hits) { - hit = lcg_step_uint(lcg_state) % local_isect->num_hits; - if (hit >= max_hits) { - return optixIgnoreIntersection(); - } - } - } - else { - if (local_isect->num_hits && optixGetRayTmax() > local_isect->hits[0].t) { - /* Record closest intersection only. - * Do not terminate ray here, since there is no guarantee about distance ordering in any-hit. - */ - return optixIgnoreIntersection(); - } - - local_isect->num_hits = 1; - } - - Intersection *isect = &local_isect->hits[hit]; - isect->t = optixGetRayTmax(); - isect->prim = prim; - isect->object = get_object_id(); - isect->type = kernel_data_fetch(objects, isect->object).primitive_type; - - const float2 barycentrics = optixGetTriangleBarycentrics(); - isect->u = 1.0f - barycentrics.y - barycentrics.x; - isect->v = barycentrics.x; - - /* Record geometric normal. */ - const uint tri_vindex = kernel_data_fetch(tri_vindex, prim).w; - const float3 tri_a = kernel_data_fetch(tri_verts, tri_vindex + 0); - const float3 tri_b = kernel_data_fetch(tri_verts, tri_vindex + 1); - const float3 tri_c = kernel_data_fetch(tri_verts, tri_vindex + 2); - local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a)); - - /* Continue tracing (without this the trace call would return after the first hit). */ - optixIgnoreIntersection(); -#endif -} - -extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit() -{ -#ifdef __SHADOW_RECORD_ALL__ - int prim = optixGetPrimitiveIndex(); - const uint object = get_object_id(); -# ifdef __VISIBILITY_FLAG__ - const uint visibility = optixGetPayload_4(); - if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) { - return optixIgnoreIntersection(); - } -# endif - - ccl_private Ray *const ray = get_payload_ptr_6<Ray>(); - if (intersection_skip_self_shadow(ray->self, object, prim)) { - return optixIgnoreIntersection(); - } - - float u = 0.0f, v = 0.0f; - int type = 0; - if (optixIsTriangleHit()) { - const float2 barycentrics = optixGetTriangleBarycentrics(); - u = 1.0f - barycentrics.y - barycentrics.x; - v = barycentrics.x; - type = kernel_data_fetch(objects, object).primitive_type; - } -# ifdef __HAIR__ - else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) { - u = __uint_as_float(optixGetAttribute_0()); - v = __uint_as_float(optixGetAttribute_1()); - - const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim); - type = segment.type; - prim = segment.prim; - -# if OPTIX_ABI_VERSION < 55 - /* Filter out curve endcaps. */ - if (u == 0.0f || u == 1.0f) { - return optixIgnoreIntersection(); - } -# endif - } -# endif - else { - type = kernel_data_fetch(objects, object).primitive_type; - u = 0.0f; - v = 0.0f; - } - -# ifndef __TRANSPARENT_SHADOWS__ - /* No transparent shadows support compiled in, make opaque. */ - optixSetPayload_5(true); - return optixTerminateRay(); -# else - const uint max_hits = optixGetPayload_3(); - const uint num_hits_packed = optixGetPayload_2(); - const uint num_recorded_hits = uint16_unpack_from_uint_0(num_hits_packed); - const uint num_hits = uint16_unpack_from_uint_1(num_hits_packed); - - /* If no transparent shadows, all light is blocked and we can stop immediately. */ - if (num_hits >= max_hits || - !(intersection_get_shader_flags(NULL, prim, type) & SD_HAS_TRANSPARENT_SHADOW)) { - optixSetPayload_5(true); - return optixTerminateRay(); - } - - /* Always use baked shadow transparency for curves. */ - if (type & PRIMITIVE_CURVE) { - float throughput = __uint_as_float(optixGetPayload_1()); - throughput *= intersection_curve_shadow_transparency(nullptr, object, prim, u); - optixSetPayload_1(__float_as_uint(throughput)); - optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits, num_hits + 1)); - - if (throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) { - optixSetPayload_5(true); - return optixTerminateRay(); - } - else { - /* Continue tracing. */ - optixIgnoreIntersection(); - return; - } - } - - /* Record transparent intersection. */ - optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits + 1, num_hits + 1)); - - uint record_index = num_recorded_hits; - - const IntegratorShadowState state = optixGetPayload_0(); - - const uint max_record_hits = min(max_hits, INTEGRATOR_SHADOW_ISECT_SIZE); - if (record_index >= max_record_hits) { - /* If maximum number of hits reached, find a hit to replace. */ - float max_recorded_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, t); - uint max_recorded_hit = 0; - - for (int i = 1; i < max_record_hits; i++) { - const float isect_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, i, t); - if (isect_t > max_recorded_t) { - max_recorded_t = isect_t; - max_recorded_hit = i; - } - } - - if (optixGetRayTmax() >= max_recorded_t) { - /* Accept hit, so that OptiX won't consider any more hits beyond the distance of the - * current hit anymore. */ - return; - } - - record_index = max_recorded_hit; - } - - INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, u) = u; - INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, v) = v; - INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, t) = optixGetRayTmax(); - INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, prim) = prim; - INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object; - INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type; - - /* Continue tracing. */ - optixIgnoreIntersection(); -# endif /* __TRANSPARENT_SHADOWS__ */ -#endif /* __SHADOW_RECORD_ALL__ */ -} - -extern "C" __global__ void __anyhit__kernel_optix_volume_test() -{ -#if defined(__HAIR__) || defined(__POINTCLOUD__) - if (!optixIsTriangleHit()) { - /* Ignore curves. */ - return optixIgnoreIntersection(); - } -#endif - - const uint object = get_object_id(); -#ifdef __VISIBILITY_FLAG__ - const uint visibility = optixGetPayload_4(); - if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) { - return optixIgnoreIntersection(); - } -#endif - - if ((kernel_data_fetch(object_flag, object) & SD_OBJECT_HAS_VOLUME) == 0) { - return optixIgnoreIntersection(); - } - - const int prim = optixGetPrimitiveIndex(); - ccl_private Ray *const ray = get_payload_ptr_6<Ray>(); - if (intersection_skip_self(ray->self, object, prim)) { - return optixIgnoreIntersection(); - } -} - -extern "C" __global__ void __anyhit__kernel_optix_visibility_test() -{ -#ifdef __HAIR__ -# if OPTIX_ABI_VERSION < 55 - if (optixGetPrimitiveType() == OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE) { - /* Filter out curve endcaps. */ - const float u = __uint_as_float(optixGetAttribute_0()); - if (u == 0.0f || u == 1.0f) { - return optixIgnoreIntersection(); - } - } -# endif -#endif - - const uint object = get_object_id(); - const uint visibility = optixGetPayload_4(); -#ifdef __VISIBILITY_FLAG__ - if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) { - return optixIgnoreIntersection(); - } -#endif - - const int prim = optixGetPrimitiveIndex(); - ccl_private Ray *const ray = get_payload_ptr_6<Ray>(); - - if (visibility & PATH_RAY_SHADOW_OPAQUE) { - if (intersection_skip_self_shadow(ray->self, object, prim)) { - return optixIgnoreIntersection(); - } - else { - /* Shadow ray early termination. */ - return optixTerminateRay(); - } - } - else { - if (intersection_skip_self(ray->self, object, prim)) { - return optixIgnoreIntersection(); - } - } -} - -extern "C" __global__ void __closesthit__kernel_optix_hit() -{ - const int object = get_object_id(); - const int prim = optixGetPrimitiveIndex(); - - optixSetPayload_0(__float_as_uint(optixGetRayTmax())); /* Intersection distance */ - optixSetPayload_4(object); - - if (optixIsTriangleHit()) { - const float2 barycentrics = optixGetTriangleBarycentrics(); - optixSetPayload_1(__float_as_uint(1.0f - barycentrics.y - barycentrics.x)); - optixSetPayload_2(__float_as_uint(barycentrics.x)); - optixSetPayload_3(prim); - optixSetPayload_5(kernel_data_fetch(objects, object).primitive_type); - } - else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) { - const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim); - optixSetPayload_1(optixGetAttribute_0()); /* Same as 'optixGetCurveParameter()' */ - optixSetPayload_2(optixGetAttribute_1()); - optixSetPayload_3(segment.prim); - optixSetPayload_5(segment.type); - } - else { - optixSetPayload_1(0); - optixSetPayload_2(0); - optixSetPayload_3(prim); - optixSetPayload_5(kernel_data_fetch(objects, object).primitive_type); - } -} - -#ifdef __HAIR__ -ccl_device_inline void optix_intersection_curve(const int prim, const int type) -{ - const int object = get_object_id(); - -# ifdef __VISIBILITY_FLAG__ - const uint visibility = optixGetPayload_4(); - if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) { - return; - } -# endif - - float3 P = optixGetObjectRayOrigin(); - float3 dir = optixGetObjectRayDirection(); - float tmin = optixGetRayTmin(); - - /* The direction is not normalized by default, but the curve intersection routine expects that */ - float len; - dir = normalize_len(dir, &len); - -# ifdef __OBJECT_MOTION__ - const float time = optixGetRayTime(); -# else - const float time = 0.0f; -# endif - - Intersection isect; - isect.t = optixGetRayTmax(); - /* Transform maximum distance into object space. */ - if (isect.t != FLT_MAX) - isect.t *= len; - - if (curve_intersect(NULL, &isect, P, dir, tmin, isect.t, object, prim, time, type)) { - static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use"); - optixReportIntersection(isect.t / len, - type & PRIMITIVE_ALL, - __float_as_int(isect.u), /* Attribute_0 */ - __float_as_int(isect.v)); /* Attribute_1 */ - } -} - -extern "C" __global__ void __intersection__curve_ribbon() -{ - const KernelCurveSegment segment = kernel_data_fetch(curve_segments, optixGetPrimitiveIndex()); - const int prim = segment.prim; - const int type = segment.type; - if (type & PRIMITIVE_CURVE_RIBBON) { - optix_intersection_curve(prim, type); - } -} - -#endif - -#ifdef __POINTCLOUD__ -extern "C" __global__ void __intersection__point() -{ - const int prim = optixGetPrimitiveIndex(); - const int object = get_object_id(); - const int type = kernel_data_fetch(objects, object).primitive_type; - -# ifdef __VISIBILITY_FLAG__ - const uint visibility = optixGetPayload_4(); - if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) { - return; - } -# endif - - float3 P = optixGetObjectRayOrigin(); - float3 dir = optixGetObjectRayDirection(); - float tmin = optixGetRayTmin(); - - /* The direction is not normalized by default, the point intersection routine expects that. */ - float len; - dir = normalize_len(dir, &len); - -# ifdef __OBJECT_MOTION__ - const float time = optixGetRayTime(); -# else - const float time = 0.0f; -# endif - - Intersection isect; - isect.t = optixGetRayTmax(); - /* Transform maximum distance into object space. */ - if (isect.t != FLT_MAX) { - isect.t *= len; - } - - if (point_intersect(NULL, &isect, P, dir, tmin, isect.t, object, prim, time, type)) { - static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use"); - optixReportIntersection(isect.t / len, type & PRIMITIVE_ALL); - } -} -#endif |