diff options
Diffstat (limited to 'intern/cycles/kernel/bvh/bvh_nodes.h')
-rw-r--r-- | intern/cycles/kernel/bvh/bvh_nodes.h | 899 |
1 files changed, 413 insertions, 486 deletions
diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h index 060b3934a41..042630121c8 100644 --- a/intern/cycles/kernel/bvh/bvh_nodes.h +++ b/intern/cycles/kernel/bvh/bvh_nodes.h @@ -20,12 +20,12 @@ ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *k int node_addr, int child) { - Transform space; - const int child_addr = node_addr + child * 3; - space.x = kernel_tex_fetch(__bvh_nodes, child_addr+1); - space.y = kernel_tex_fetch(__bvh_nodes, child_addr+2); - space.z = kernel_tex_fetch(__bvh_nodes, child_addr+3); - return space; + Transform space; + const int child_addr = node_addr + child * 3; + space.x = kernel_tex_fetch(__bvh_nodes, child_addr + 1); + space.y = kernel_tex_fetch(__bvh_nodes, child_addr + 2); + space.z = kernel_tex_fetch(__bvh_nodes, child_addr + 3); + return space; } #if !defined(__KERNEL_SSE2__) @@ -38,42 +38,41 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg, float dist[2]) { - /* fetch node data */ - float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); - float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr+1); - float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr+2); - float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr+3); - - /* intersect ray against child nodes */ - float c0lox = (node0.x - P.x) * idir.x; - float c0hix = (node0.z - P.x) * idir.x; - float c0loy = (node1.x - P.y) * idir.y; - float c0hiy = (node1.z - P.y) * idir.y; - float c0loz = (node2.x - P.z) * idir.z; - float c0hiz = (node2.z - P.z) * idir.z; - float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz)); - float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz)); - - float c1lox = (node0.y - P.x) * idir.x; - float c1hix = (node0.w - P.x) * idir.x; - float c1loy = (node1.y - P.y) * idir.y; - float c1hiy = (node1.w - P.y) * idir.y; - float c1loz = (node2.y - P.z) * idir.z; - float c1hiz = (node2.w - P.z) * idir.z; - float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz)); - float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz)); - - dist[0] = c0min; - dist[1] = c1min; - -#ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? */ - return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | - (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); -#else - return ((c0max >= c0min)? 1: 0) | - ((c1max >= c1min)? 2: 0); -#endif + /* fetch node data */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); + float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr + 1); + float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr + 2); + float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr + 3); + + /* intersect ray against child nodes */ + float c0lox = (node0.x - P.x) * idir.x; + float c0hix = (node0.z - P.x) * idir.x; + float c0loy = (node1.x - P.y) * idir.y; + float c0hiy = (node1.z - P.y) * idir.y; + float c0loz = (node2.x - P.z) * idir.z; + float c0hiz = (node2.z - P.z) * idir.z; + float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz)); + float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz)); + + float c1lox = (node0.y - P.x) * idir.x; + float c1hix = (node0.w - P.x) * idir.x; + float c1loy = (node1.y - P.y) * idir.y; + float c1hiy = (node1.w - P.y) * idir.y; + float c1loz = (node2.y - P.z) * idir.z; + float c1hiz = (node2.w - P.z) * idir.z; + float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz)); + float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz)); + + dist[0] = c0min; + dist[1] = c1min; + +# ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) | + (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0); +# else + return ((c0max >= c0min) ? 1 : 0) | ((c1max >= c1min) ? 2 : 0); +# endif } ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg, @@ -87,118 +86,115 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg, float dist[2]) { - /* fetch node data */ - float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); - float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr+1); - float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr+2); - float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr+3); - - /* intersect ray against child nodes */ - float c0lox = (node0.x - P.x) * idir.x; - float c0hix = (node0.z - P.x) * idir.x; - float c0loy = (node1.x - P.y) * idir.y; - float c0hiy = (node1.z - P.y) * idir.y; - float c0loz = (node2.x - P.z) * idir.z; - float c0hiz = (node2.z - P.z) * idir.z; - float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz)); - float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz)); - - float c1lox = (node0.y - P.x) * idir.x; - float c1hix = (node0.w - P.x) * idir.x; - float c1loy = (node1.y - P.y) * idir.y; - float c1hiy = (node1.w - P.y) * idir.y; - float c1loz = (node2.y - P.z) * idir.z; - float c1hiz = (node2.w - P.z) * idir.z; - float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz)); - float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz)); - - if(difl != 0.0f) { - float hdiff = 1.0f + difl; - float ldiff = 1.0f - difl; - if(__float_as_int(cnodes.z) & PATH_RAY_CURVE) { - c0min = max(ldiff * c0min, c0min - extmax); - c0max = min(hdiff * c0max, c0max + extmax); - } - if(__float_as_int(cnodes.w) & PATH_RAY_CURVE) { - c1min = max(ldiff * c1min, c1min - extmax); - c1max = min(hdiff * c1max, c1max + extmax); - } - } - - dist[0] = c0min; - dist[1] = c1min; - -#ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? */ - return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | - (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); -#else - return ((c0max >= c0min)? 1: 0) | - ((c1max >= c1min)? 2: 0); -#endif + /* fetch node data */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); + float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr + 1); + float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr + 2); + float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr + 3); + + /* intersect ray against child nodes */ + float c0lox = (node0.x - P.x) * idir.x; + float c0hix = (node0.z - P.x) * idir.x; + float c0loy = (node1.x - P.y) * idir.y; + float c0hiy = (node1.z - P.y) * idir.y; + float c0loz = (node2.x - P.z) * idir.z; + float c0hiz = (node2.z - P.z) * idir.z; + float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz)); + float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz)); + + float c1lox = (node0.y - P.x) * idir.x; + float c1hix = (node0.w - P.x) * idir.x; + float c1loy = (node1.y - P.y) * idir.y; + float c1hiy = (node1.w - P.y) * idir.y; + float c1loz = (node2.y - P.z) * idir.z; + float c1hiz = (node2.w - P.z) * idir.z; + float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz)); + float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz)); + + if (difl != 0.0f) { + float hdiff = 1.0f + difl; + float ldiff = 1.0f - difl; + if (__float_as_int(cnodes.z) & PATH_RAY_CURVE) { + c0min = max(ldiff * c0min, c0min - extmax); + c0max = min(hdiff * c0max, c0max + extmax); + } + if (__float_as_int(cnodes.w) & PATH_RAY_CURVE) { + c1min = max(ldiff * c1min, c1min - extmax); + c1max = min(hdiff * c1max, c1max + extmax); + } + } + + dist[0] = c0min; + dist[1] = c1min; + +# ifdef __VISIBILITY_FLAG__ + /* this visibility test gives a 5% performance hit, how to solve? */ + return (((c0max >= c0min) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) | + (((c1max >= c1min) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0); +# else + return ((c0max >= c0min) ? 1 : 0) | ((c1max >= c1min) ? 2 : 0); +# endif } -ccl_device_forceinline bool bvh_unaligned_node_intersect_child( - KernelGlobals *kg, - const float3 P, - const float3 dir, - const float t, - int node_addr, - int child, - float dist[2]) +ccl_device_forceinline bool bvh_unaligned_node_intersect_child(KernelGlobals *kg, + const float3 P, + const float3 dir, + const float t, + int node_addr, + int child, + float dist[2]) { - Transform space = bvh_unaligned_node_fetch_space(kg, node_addr, child); - float3 aligned_dir = transform_direction(&space, dir); - float3 aligned_P = transform_point(&space, P); - float3 nrdir = -bvh_inverse_direction(aligned_dir); - float3 lower_xyz = aligned_P * nrdir; - float3 upper_xyz = lower_xyz - nrdir; - const float near_x = min(lower_xyz.x, upper_xyz.x); - const float near_y = min(lower_xyz.y, upper_xyz.y); - const float near_z = min(lower_xyz.z, upper_xyz.z); - const float far_x = max(lower_xyz.x, upper_xyz.x); - const float far_y = max(lower_xyz.y, upper_xyz.y); - const float far_z = max(lower_xyz.z, upper_xyz.z); - const float tnear = max4(0.0f, near_x, near_y, near_z); - const float tfar = min4(t, far_x, far_y, far_z); - *dist = tnear; - return tnear <= tfar; + Transform space = bvh_unaligned_node_fetch_space(kg, node_addr, child); + float3 aligned_dir = transform_direction(&space, dir); + float3 aligned_P = transform_point(&space, P); + float3 nrdir = -bvh_inverse_direction(aligned_dir); + float3 lower_xyz = aligned_P * nrdir; + float3 upper_xyz = lower_xyz - nrdir; + const float near_x = min(lower_xyz.x, upper_xyz.x); + const float near_y = min(lower_xyz.y, upper_xyz.y); + const float near_z = min(lower_xyz.z, upper_xyz.z); + const float far_x = max(lower_xyz.x, upper_xyz.x); + const float far_y = max(lower_xyz.y, upper_xyz.y); + const float far_z = max(lower_xyz.z, upper_xyz.z); + const float tnear = max4(0.0f, near_x, near_y, near_z); + const float tfar = min4(t, far_x, far_y, far_z); + *dist = tnear; + return tnear <= tfar; } -ccl_device_forceinline bool bvh_unaligned_node_intersect_child_robust( - KernelGlobals *kg, - const float3 P, - const float3 dir, - const float t, - const float difl, - int node_addr, - int child, - float dist[2]) +ccl_device_forceinline bool bvh_unaligned_node_intersect_child_robust(KernelGlobals *kg, + const float3 P, + const float3 dir, + const float t, + const float difl, + int node_addr, + int child, + float dist[2]) { - Transform space = bvh_unaligned_node_fetch_space(kg, node_addr, child); - float3 aligned_dir = transform_direction(&space, dir); - float3 aligned_P = transform_point(&space, P); - float3 nrdir = -bvh_inverse_direction(aligned_dir); - float3 tLowerXYZ = aligned_P * nrdir; - float3 tUpperXYZ = tLowerXYZ - nrdir; - const float near_x = min(tLowerXYZ.x, tUpperXYZ.x); - const float near_y = min(tLowerXYZ.y, tUpperXYZ.y); - const float near_z = min(tLowerXYZ.z, tUpperXYZ.z); - const float far_x = max(tLowerXYZ.x, tUpperXYZ.x); - const float far_y = max(tLowerXYZ.y, tUpperXYZ.y); - const float far_z = max(tLowerXYZ.z, tUpperXYZ.z); - const float tnear = max4(0.0f, near_x, near_y, near_z); - const float tfar = min4(t, far_x, far_y, far_z); - *dist = tnear; - if(difl != 0.0f) { - /* TODO(sergey): Same as for QBVH, needs a proper use. */ - const float round_down = 1.0f - difl; - const float round_up = 1.0f + difl; - return round_down*tnear <= round_up*tfar; - } - else { - return tnear <= tfar; - } + Transform space = bvh_unaligned_node_fetch_space(kg, node_addr, child); + float3 aligned_dir = transform_direction(&space, dir); + float3 aligned_P = transform_point(&space, P); + float3 nrdir = -bvh_inverse_direction(aligned_dir); + float3 tLowerXYZ = aligned_P * nrdir; + float3 tUpperXYZ = tLowerXYZ - nrdir; + const float near_x = min(tLowerXYZ.x, tUpperXYZ.x); + const float near_y = min(tLowerXYZ.y, tUpperXYZ.y); + const float near_z = min(tLowerXYZ.z, tUpperXYZ.z); + const float far_x = max(tLowerXYZ.x, tUpperXYZ.x); + const float far_y = max(tLowerXYZ.y, tUpperXYZ.y); + const float far_z = max(tLowerXYZ.z, tUpperXYZ.z); + const float tnear = max4(0.0f, near_x, near_y, near_z); + const float tfar = min4(t, far_x, far_y, far_z); + *dist = tnear; + if (difl != 0.0f) { + /* TODO(sergey): Same as for QBVH, needs a proper use. */ + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + return round_down * tnear <= round_up * tfar; + } + else { + return tnear <= tfar; + } } ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, @@ -210,25 +206,25 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, const uint visibility, float dist[2]) { - int mask = 0; - float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); - if(bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 0, &dist[0])) { -#ifdef __VISIBILITY_FLAG__ - if((__float_as_uint(cnodes.x) & visibility)) -#endif - { - mask |= 1; - } - } - if(bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 1, &dist[1])) { -#ifdef __VISIBILITY_FLAG__ - if((__float_as_uint(cnodes.y) & visibility)) -#endif - { - mask |= 2; - } - } - return mask; + int mask = 0; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); + if (bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 0, &dist[0])) { +# ifdef __VISIBILITY_FLAG__ + if ((__float_as_uint(cnodes.x) & visibility)) +# endif + { + mask |= 1; + } + } + if (bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 1, &dist[1])) { +# ifdef __VISIBILITY_FLAG__ + if ((__float_as_uint(cnodes.y) & visibility)) +# endif + { + mask |= 2; + } + } + return mask; } ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg, @@ -242,25 +238,25 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg const uint visibility, float dist[2]) { - int mask = 0; - float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); - if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, node_addr, 0, &dist[0])) { -#ifdef __VISIBILITY_FLAG__ - if((__float_as_uint(cnodes.x) & visibility)) -#endif - { - mask |= 1; - } - } - if(bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, node_addr, 1, &dist[1])) { -#ifdef __VISIBILITY_FLAG__ - if((__float_as_uint(cnodes.y) & visibility)) -#endif - { - mask |= 2; - } - } - return mask; + int mask = 0; + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); + if (bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, node_addr, 0, &dist[0])) { +# ifdef __VISIBILITY_FLAG__ + if ((__float_as_uint(cnodes.x) & visibility)) +# endif + { + mask |= 1; + } + } + if (bvh_unaligned_node_intersect_child_robust(kg, P, dir, t, difl, node_addr, 1, &dist[1])) { +# ifdef __VISIBILITY_FLAG__ + if ((__float_as_uint(cnodes.y) & visibility)) +# endif + { + mask |= 2; + } + } + return mask; } ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg, @@ -272,26 +268,13 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg, const uint visibility, float dist[2]) { - float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); - if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { - return bvh_unaligned_node_intersect(kg, - P, - dir, - idir, - t, - node_addr, - visibility, - dist); - } - else { - return bvh_aligned_node_intersect(kg, - P, - idir, - t, - node_addr, - visibility, - dist); - } + float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); + if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return bvh_unaligned_node_intersect(kg, P, dir, idir, t, node_addr, visibility, dist); + } + else { + return bvh_aligned_node_intersect(kg, P, idir, t, node_addr, visibility, dist); + } } ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg, @@ -305,279 +288,244 @@ ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg, const uint visibility, float dist[2]) { - float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); - if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { - return bvh_unaligned_node_intersect_robust(kg, - P, - dir, - idir, - t, - difl, - extmax, - node_addr, - visibility, - dist); - } - else { - return bvh_aligned_node_intersect_robust(kg, - P, - idir, - t, - difl, - extmax, - node_addr, - visibility, - dist); - } + float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); + if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return bvh_unaligned_node_intersect_robust( + kg, P, dir, idir, t, difl, extmax, node_addr, visibility, dist); + } + else { + return bvh_aligned_node_intersect_robust( + kg, P, idir, t, difl, extmax, node_addr, visibility, dist); + } } -#else /* !defined(__KERNEL_SSE2__) */ - -int ccl_device_forceinline bvh_aligned_node_intersect( - KernelGlobals *kg, - const float3& P, - const float3& dir, - const ssef& tsplat, - const ssef Psplat[3], - const ssef idirsplat[3], - const shuffle_swap_t shufflexyz[3], - const int node_addr, - const uint visibility, - float dist[2]) +#else /* !defined(__KERNEL_SSE2__) */ + +int ccl_device_forceinline bvh_aligned_node_intersect(KernelGlobals *kg, + const float3 &P, + const float3 &dir, + const ssef &tsplat, + const ssef Psplat[3], + const ssef idirsplat[3], + const shuffle_swap_t shufflexyz[3], + const int node_addr, + const uint visibility, + float dist[2]) { - /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ - const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); + /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); - /* fetch node data */ - const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + node_addr; + /* fetch node data */ + const ssef *bvh_nodes = (ssef *)kg->__bvh_nodes.data + node_addr; - /* intersect ray against child nodes */ - const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; - const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; - const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; + /* intersect ray against child nodes */ + const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; + const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; + const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - /* calculate { c0min, c1min, -c0max, -c1max} */ - ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); - const ssef tminmax = minmax ^ pn; - const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); + /* calculate { c0min, c1min, -c0max, -c1max} */ + ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); + const ssef tminmax = minmax ^ pn; + const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); - dist[0] = tminmax[0]; - dist[1] = tminmax[1]; + dist[0] = tminmax[0]; + dist[1] = tminmax[1]; - int mask = movemask(lrhit); + int mask = movemask(lrhit); # ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? */ - float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); - int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | - (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); - return cmask; + /* this visibility test gives a 5% performance hit, how to solve? */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); + int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) | + (((mask & 2) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0); + return cmask; # else - return mask & 3; + return mask & 3; # endif } -ccl_device_forceinline int bvh_aligned_node_intersect_robust( - KernelGlobals *kg, - const float3& P, - const float3& dir, - const ssef& tsplat, - const ssef Psplat[3], - const ssef idirsplat[3], - const shuffle_swap_t shufflexyz[3], - const float difl, - const float extmax, - const int nodeAddr, - const uint visibility, - float dist[2]) +ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg, + const float3 &P, + const float3 &dir, + const ssef &tsplat, + const ssef Psplat[3], + const ssef idirsplat[3], + const shuffle_swap_t shufflexyz[3], + const float difl, + const float extmax, + const int nodeAddr, + const uint visibility, + float dist[2]) { - /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ - const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); - - /* fetch node data */ - const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr; - - /* intersect ray against child nodes */ - const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; - const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; - const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; - - /* calculate { c0min, c1min, -c0max, -c1max} */ - ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); - const ssef tminmax = minmax ^ pn; - - if(difl != 0.0f) { - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); - float4 *tminmaxview = (float4*)&tminmax; - float& c0min = tminmaxview->x, &c1min = tminmaxview->y; - float& c0max = tminmaxview->z, &c1max = tminmaxview->w; - float hdiff = 1.0f + difl; - float ldiff = 1.0f - difl; - if(__float_as_int(cnodes.x) & PATH_RAY_CURVE) { - c0min = max(ldiff * c0min, c0min - extmax); - c0max = min(hdiff * c0max, c0max + extmax); - } - if(__float_as_int(cnodes.y) & PATH_RAY_CURVE) { - c1min = max(ldiff * c1min, c1min - extmax); - c1max = min(hdiff * c1max, c1max + extmax); - } - } - - const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); - - dist[0] = tminmax[0]; - dist[1] = tminmax[1]; - - int mask = movemask(lrhit); + /* Intersect two child bounding boxes, SSE3 version adapted from Embree */ + const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); + + /* fetch node data */ + const ssef *bvh_nodes = (ssef *)kg->__bvh_nodes.data + nodeAddr; + + /* intersect ray against child nodes */ + const ssef tminmaxx = (shuffle_swap(bvh_nodes[1], shufflexyz[0]) - Psplat[0]) * idirsplat[0]; + const ssef tminmaxy = (shuffle_swap(bvh_nodes[2], shufflexyz[1]) - Psplat[1]) * idirsplat[1]; + const ssef tminmaxz = (shuffle_swap(bvh_nodes[3], shufflexyz[2]) - Psplat[2]) * idirsplat[2]; + + /* calculate { c0min, c1min, -c0max, -c1max} */ + ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat)); + const ssef tminmax = minmax ^ pn; + + if (difl != 0.0f) { + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr + 0); + float4 *tminmaxview = (float4 *)&tminmax; + float &c0min = tminmaxview->x, &c1min = tminmaxview->y; + float &c0max = tminmaxview->z, &c1max = tminmaxview->w; + float hdiff = 1.0f + difl; + float ldiff = 1.0f - difl; + if (__float_as_int(cnodes.x) & PATH_RAY_CURVE) { + c0min = max(ldiff * c0min, c0min - extmax); + c0max = min(hdiff * c0max, c0max + extmax); + } + if (__float_as_int(cnodes.y) & PATH_RAY_CURVE) { + c1min = max(ldiff * c1min, c1min - extmax); + c1max = min(hdiff * c1max, c1max + extmax); + } + } + + const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); + + dist[0] = tminmax[0]; + dist[1] = tminmax[1]; + + int mask = movemask(lrhit); # ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? */ - float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr+0); - int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | - (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); - return cmask; + /* this visibility test gives a 5% performance hit, how to solve? */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr + 0); + int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) | + (((mask & 2) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0); + return cmask; # else - return mask & 3; + return mask & 3; # endif } ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, const float3 P, const float3 dir, - const ssef& isect_near, - const ssef& isect_far, + const ssef &isect_near, + const ssef &isect_far, const int node_addr, const uint visibility, float dist[2]) { - Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0); - Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1); - - float3 aligned_dir0 = transform_direction(&space0, dir), - aligned_dir1 = transform_direction(&space1, dir); - float3 aligned_P0 = transform_point(&space0, P), - aligned_P1 = transform_point(&space1, P); - float3 nrdir0 = -bvh_inverse_direction(aligned_dir0), - nrdir1 = -bvh_inverse_direction(aligned_dir1); - - ssef lower_x = ssef(aligned_P0.x * nrdir0.x, - aligned_P1.x * nrdir1.x, - 0.0f, 0.0f), - lower_y = ssef(aligned_P0.y * nrdir0.y, - aligned_P1.y * nrdir1.y, - 0.0f, - 0.0f), - lower_z = ssef(aligned_P0.z * nrdir0.z, - aligned_P1.z * nrdir1.z, - 0.0f, - 0.0f); - - ssef upper_x = lower_x - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f), - upper_y = lower_y - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f), - upper_z = lower_z - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f); - - ssef tnear_x = min(lower_x, upper_x); - ssef tnear_y = min(lower_y, upper_y); - ssef tnear_z = min(lower_z, upper_z); - ssef tfar_x = max(lower_x, upper_x); - ssef tfar_y = max(lower_y, upper_y); - ssef tfar_z = max(lower_z, upper_z); - - const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); - const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); - sseb vmask = tnear <= tfar; - dist[0] = tnear.f[0]; - dist[1] = tnear.f[1]; - - int mask = (int)movemask(vmask); + Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0); + Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1); + + float3 aligned_dir0 = transform_direction(&space0, dir), + aligned_dir1 = transform_direction(&space1, dir); + float3 aligned_P0 = transform_point(&space0, P), aligned_P1 = transform_point(&space1, P); + float3 nrdir0 = -bvh_inverse_direction(aligned_dir0), + nrdir1 = -bvh_inverse_direction(aligned_dir1); + + ssef lower_x = ssef(aligned_P0.x * nrdir0.x, aligned_P1.x * nrdir1.x, 0.0f, 0.0f), + lower_y = ssef(aligned_P0.y * nrdir0.y, aligned_P1.y * nrdir1.y, 0.0f, 0.0f), + lower_z = ssef(aligned_P0.z * nrdir0.z, aligned_P1.z * nrdir1.z, 0.0f, 0.0f); + + ssef upper_x = lower_x - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f), + upper_y = lower_y - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f), + upper_z = lower_z - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f); + + ssef tnear_x = min(lower_x, upper_x); + ssef tnear_y = min(lower_y, upper_y); + ssef tnear_z = min(lower_z, upper_z); + ssef tfar_x = max(lower_x, upper_x); + ssef tfar_y = max(lower_y, upper_y); + ssef tfar_z = max(lower_z, upper_z); + + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + sseb vmask = tnear <= tfar; + dist[0] = tnear.f[0]; + dist[1] = tnear.f[1]; + + int mask = (int)movemask(vmask); # ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? */ - float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); - int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | - (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); - return cmask; + /* this visibility test gives a 5% performance hit, how to solve? */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); + int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) | + (((mask & 2) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0); + return cmask; # else - return mask & 3; + return mask & 3; # endif } ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg, const float3 P, const float3 dir, - const ssef& isect_near, - const ssef& isect_far, + const ssef &isect_near, + const ssef &isect_far, const float difl, const int node_addr, const uint visibility, float dist[2]) { - Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0); - Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1); - - float3 aligned_dir0 = transform_direction(&space0, dir), - aligned_dir1 = transform_direction(&space1, dir); - float3 aligned_P0 = transform_point(&space0, P), - aligned_P1 = transform_point(&space1, P); - float3 nrdir0 = -bvh_inverse_direction(aligned_dir0), - nrdir1 = -bvh_inverse_direction(aligned_dir1); - - ssef lower_x = ssef(aligned_P0.x * nrdir0.x, - aligned_P1.x * nrdir1.x, - 0.0f, 0.0f), - lower_y = ssef(aligned_P0.y * nrdir0.y, - aligned_P1.y * nrdir1.y, - 0.0f, - 0.0f), - lower_z = ssef(aligned_P0.z * nrdir0.z, - aligned_P1.z * nrdir1.z, - 0.0f, - 0.0f); - - ssef upper_x = lower_x - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f), - upper_y = lower_y - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f), - upper_z = lower_z - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f); - - ssef tnear_x = min(lower_x, upper_x); - ssef tnear_y = min(lower_y, upper_y); - ssef tnear_z = min(lower_z, upper_z); - ssef tfar_x = max(lower_x, upper_x); - ssef tfar_y = max(lower_y, upper_y); - ssef tfar_z = max(lower_z, upper_z); - - const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); - const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); - sseb vmask; - if(difl != 0.0f) { - const float round_down = 1.0f - difl; - const float round_up = 1.0f + difl; - vmask = round_down*tnear <= round_up*tfar; - } - else { - vmask = tnear <= tfar; - } - - dist[0] = tnear.f[0]; - dist[1] = tnear.f[1]; - - int mask = (int)movemask(vmask); + Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0); + Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1); + + float3 aligned_dir0 = transform_direction(&space0, dir), + aligned_dir1 = transform_direction(&space1, dir); + float3 aligned_P0 = transform_point(&space0, P), aligned_P1 = transform_point(&space1, P); + float3 nrdir0 = -bvh_inverse_direction(aligned_dir0), + nrdir1 = -bvh_inverse_direction(aligned_dir1); + + ssef lower_x = ssef(aligned_P0.x * nrdir0.x, aligned_P1.x * nrdir1.x, 0.0f, 0.0f), + lower_y = ssef(aligned_P0.y * nrdir0.y, aligned_P1.y * nrdir1.y, 0.0f, 0.0f), + lower_z = ssef(aligned_P0.z * nrdir0.z, aligned_P1.z * nrdir1.z, 0.0f, 0.0f); + + ssef upper_x = lower_x - ssef(nrdir0.x, nrdir1.x, 0.0f, 0.0f), + upper_y = lower_y - ssef(nrdir0.y, nrdir1.y, 0.0f, 0.0f), + upper_z = lower_z - ssef(nrdir0.z, nrdir1.z, 0.0f, 0.0f); + + ssef tnear_x = min(lower_x, upper_x); + ssef tnear_y = min(lower_y, upper_y); + ssef tnear_z = min(lower_z, upper_z); + ssef tfar_x = max(lower_x, upper_x); + ssef tfar_y = max(lower_y, upper_y); + ssef tfar_z = max(lower_z, upper_z); + + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + sseb vmask; + if (difl != 0.0f) { + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + vmask = round_down * tnear <= round_up * tfar; + } + else { + vmask = tnear <= tfar; + } + + dist[0] = tnear.f[0]; + dist[1] = tnear.f[1]; + + int mask = (int)movemask(vmask); # ifdef __VISIBILITY_FLAG__ - /* this visibility test gives a 5% performance hit, how to solve? */ - float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); - int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility))? 1: 0) | - (((mask & 2) && (__float_as_uint(cnodes.y) & visibility))? 2: 0); - return cmask; + /* this visibility test gives a 5% performance hit, how to solve? */ + float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0); + int cmask = (((mask & 1) && (__float_as_uint(cnodes.x) & visibility)) ? 1 : 0) | + (((mask & 2) && (__float_as_uint(cnodes.y) & visibility)) ? 2 : 0); + return cmask; # else - return mask & 3; + return mask & 3; # endif } ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg, - const float3& P, - const float3& dir, - const ssef& isect_near, - const ssef& isect_far, - const ssef& tsplat, + const float3 &P, + const float3 &dir, + const ssef &isect_near, + const ssef &isect_far, + const ssef &tsplat, const ssef Psplat[3], const ssef idirsplat[3], const shuffle_swap_t shufflexyz[3], @@ -585,37 +533,23 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg, const uint visibility, float dist[2]) { - float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); - if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { - return bvh_unaligned_node_intersect(kg, - P, - dir, - isect_near, - isect_far, - node_addr, - visibility, - dist); - } - else { - return bvh_aligned_node_intersect(kg, - P, - dir, - tsplat, - Psplat, - idirsplat, - shufflexyz, - node_addr, - visibility, - dist); - } + float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); + if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return bvh_unaligned_node_intersect( + kg, P, dir, isect_near, isect_far, node_addr, visibility, dist); + } + else { + return bvh_aligned_node_intersect( + kg, P, dir, tsplat, Psplat, idirsplat, shufflexyz, node_addr, visibility, dist); + } } ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg, - const float3& P, - const float3& dir, - const ssef& isect_near, - const ssef& isect_far, - const ssef& tsplat, + const float3 &P, + const float3 &dir, + const ssef &isect_near, + const ssef &isect_far, + const ssef &tsplat, const ssef Psplat[3], const ssef idirsplat[3], const shuffle_swap_t shufflexyz[3], @@ -625,31 +559,24 @@ ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg, const uint visibility, float dist[2]) { - float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); - if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { - return bvh_unaligned_node_intersect_robust(kg, - P, - dir, - isect_near, - isect_far, - difl, - node_addr, - visibility, - dist); - } - else { - return bvh_aligned_node_intersect_robust(kg, - P, - dir, - tsplat, - Psplat, - idirsplat, - shufflexyz, - difl, - extmax, - node_addr, - visibility, - dist); - } + float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); + if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return bvh_unaligned_node_intersect_robust( + kg, P, dir, isect_near, isect_far, difl, node_addr, visibility, dist); + } + else { + return bvh_aligned_node_intersect_robust(kg, + P, + dir, + tsplat, + Psplat, + idirsplat, + shufflexyz, + difl, + extmax, + node_addr, + visibility, + dist); + } } -#endif /* !defined(__KERNEL_SSE2__) */ +#endif /* !defined(__KERNEL_SSE2__) */ |