/* * Copyright 2011-2014, Blender Foundation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * Aligned nodes intersection SSE code is adopted from Embree, */ struct QBVHStackItem { int addr; float dist; }; ccl_device_inline void qbvh_near_far_idx_calc(const float3& idir, int *ccl_restrict near_x, int *ccl_restrict near_y, int *ccl_restrict near_z, int *ccl_restrict far_x, int *ccl_restrict far_y, int *ccl_restrict far_z) { #ifdef __KERNEL_SSE__ *near_x = 0; *far_x = 1; *near_y = 2; *far_y = 3; *near_z = 4; *far_z = 5; const size_t mask = movemask(ssef(idir.m128)); const int mask_x = mask & 1; const int mask_y = (mask & 2) >> 1; const int mask_z = (mask & 4) >> 2; *near_x += mask_x; *far_x -= mask_x; *near_y += mask_y; *far_y -= mask_y; *near_z += mask_z; *far_z -= mask_z; #else if(idir.x >= 0.0f) { *near_x = 0; *far_x = 1; } else { *near_x = 1; *far_x = 0; } if(idir.y >= 0.0f) { *near_y = 2; *far_y = 3; } else { *near_y = 3; *far_y = 2; } if(idir.z >= 0.0f) { *near_z = 4; *far_z = 5; } else { *near_z = 5; *far_z = 4; } #endif } /* TOOD(sergey): Investigate if using intrinsics helps for both * stack item swap and float comparison. */ ccl_device_inline void qbvh_item_swap(QBVHStackItem *ccl_restrict a, QBVHStackItem *ccl_restrict b) { QBVHStackItem tmp = *a; *a = *b; *b = tmp; } ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1, QBVHStackItem *ccl_restrict s2, QBVHStackItem *ccl_restrict s3) { if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); } if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); } if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); } } ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1, QBVHStackItem *ccl_restrict s2, QBVHStackItem *ccl_restrict s3, QBVHStackItem *ccl_restrict s4) { if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); } if(s4->dist < s3->dist) { qbvh_item_swap(s4, s3); } if(s3->dist < s1->dist) { qbvh_item_swap(s3, s1); } if(s4->dist < s2->dist) { qbvh_item_swap(s4, s2); } if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); } } /* Axis-aligned nodes intersection */ ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg, const ssef& isect_near, const ssef& isect_far, #ifdef __KERNEL_AVX2__ const sse3f& org_idir, #else const sse3f& org, #endif const sse3f& idir, const int near_x, const int near_y, const int near_z, const int far_x, const int far_y, const int far_z, const int node_addr, ssef *ccl_restrict dist) { const int offset = node_addr + 1; #ifdef __KERNEL_AVX2__ const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, org_idir.x); const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, org_idir.y); const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, org_idir.z); const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, org_idir.x); const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, org_idir.y); const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z), idir.z, org_idir.z); #else const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - org.x) * idir.x; const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - org.y) * idir.y; const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - org.z) * idir.z; const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - org.x) * idir.x; const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - org.y) * idir.y; const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - org.z) * idir.z; #endif #ifdef __KERNEL_SSE41__ const ssef tnear = maxi(maxi(tnear_x, tnear_y), maxi(tnear_z, isect_near)); const ssef tfar = mini(mini(tfar_x, tfar_y), mini(tfar_z, isect_far)); const sseb vmask = cast(tnear) > cast(tfar); int mask = (int)movemask(vmask)^0xf; #else const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); const sseb vmask = tnear <= tfar; int mask = (int)movemask(vmask); #endif *dist = tnear; return mask; } ccl_device_inline int qbvh_aligned_node_intersect_robust( KernelGlobals *ccl_restrict kg, const ssef& isect_near, const ssef& isect_far, #ifdef __KERNEL_AVX2__ const sse3f& P_idir, #else const sse3f& P, #endif const sse3f& idir, const int near_x, const int near_y, const int near_z, const int far_x, const int far_y, const int far_z, const int node_addr, const float difl, ssef *ccl_restrict dist) { const int offset = node_addr + 1; #ifdef __KERNEL_AVX2__ const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, P_idir.x); const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, P_idir.y); const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, P_idir.z); const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, P_idir.x); const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, P_idir.y); const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z), idir.z, P_idir.z); #else const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - P.x) * idir.x; const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - P.y) * idir.y; const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - P.z) * idir.z; const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - P.x) * idir.x; const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - P.y) * idir.y; const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - P.z) * idir.z; #endif const float round_down = 1.0f - difl; const float round_up = 1.0f + difl; const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); const sseb vmask = round_down*tnear <= round_up*tfar; *dist = tnear; return (int)movemask(vmask); } /* Unaligned nodes intersection */ ccl_device_inline int qbvh_unaligned_node_intersect( KernelGlobals *ccl_restrict kg, const ssef& isect_near, const ssef& isect_far, #ifdef __KERNEL_AVX2__ const sse3f& org_idir, #endif const sse3f& org, const sse3f& dir, const sse3f& idir, const int near_x, const int near_y, const int near_z, const int far_x, const int far_y, const int far_z, const int node_addr, ssef *ccl_restrict dist) { const int offset = node_addr; const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+1); const ssef tfm_x_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+2); const ssef tfm_x_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+3); const ssef tfm_y_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+4); const ssef tfm_y_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+5); const ssef tfm_y_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+6); const ssef tfm_z_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+7); const ssef tfm_z_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+8); const ssef tfm_z_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+9); const ssef tfm_t_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+10); const ssef tfm_t_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+11); const ssef tfm_t_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+12); const ssef aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z, aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z, aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z; const ssef aligned_P_x = org.x*tfm_x_x + org.y*tfm_x_y + org.z*tfm_x_z + tfm_t_x, aligned_P_y = org.x*tfm_y_x + org.y*tfm_y_y + org.z*tfm_y_z + tfm_t_y, aligned_P_z = org.x*tfm_z_x + org.y*tfm_z_y + org.z*tfm_z_z + tfm_t_z; const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f); const ssef nrdir_x = neg_one / aligned_dir_x, nrdir_y = neg_one / aligned_dir_y, nrdir_z = neg_one / aligned_dir_z; const ssef tlower_x = aligned_P_x * nrdir_x, tlower_y = aligned_P_y * nrdir_y, tlower_z = aligned_P_z * nrdir_z; const ssef tupper_x = tlower_x - nrdir_x, tupper_y = tlower_y - nrdir_y, tupper_z = tlower_z - nrdir_z; #ifdef __KERNEL_SSE41__ const ssef tnear_x = mini(tlower_x, tupper_x); const ssef tnear_y = mini(tlower_y, tupper_y); const ssef tnear_z = mini(tlower_z, tupper_z); const ssef tfar_x = maxi(tlower_x, tupper_x); const ssef tfar_y = maxi(tlower_y, tupper_y); const ssef tfar_z = maxi(tlower_z, tupper_z); const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); const sseb vmask = tnear <= tfar; *dist = tnear; return movemask(vmask); #else const ssef tnear_x = min(tlower_x, tupper_x); const ssef tnear_y = min(tlower_y, tupper_y); const ssef tnear_z = min(tlower_z, tupper_z); const ssef tfar_x = max(tlower_x, tupper_x); const ssef tfar_y = max(tlower_y, tupper_y); const ssef tfar_z = max(tlower_z, tupper_z); const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); const sseb vmask = tnear <= tfar; *dist = tnear; return movemask(vmask); #endif } ccl_device_inline int qbvh_unaligned_node_intersect_robust( KernelGlobals *ccl_restrict kg, const ssef& isect_near, const ssef& isect_far, #ifdef __KERNEL_AVX2__ const sse3f& P_idir, #endif const sse3f& P, const sse3f& dir, const sse3f& idir, const int near_x, const int near_y, const int near_z, const int far_x, const int far_y, const int far_z, const int node_addr, const float difl, ssef *ccl_restrict dist) { const int offset = node_addr; const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+1); const ssef tfm_x_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+2); const ssef tfm_x_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+3); const ssef tfm_y_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+4); const ssef tfm_y_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+5); const ssef tfm_y_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+6); const ssef tfm_z_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+7); const ssef tfm_z_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+8); const ssef tfm_z_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+9); const ssef tfm_t_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+10); const ssef tfm_t_y = kernel_tex_fetch_ssef(__bvh_nodes, offset+11); const ssef tfm_t_z = kernel_tex_fetch_ssef(__bvh_nodes, offset+12); const ssef aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z, aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z, aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z; const ssef aligned_P_x = P.x*tfm_x_x + P.y*tfm_x_y + P.z*tfm_x_z + tfm_t_x, aligned_P_y = P.x*tfm_y_x + P.y*tfm_y_y + P.z*tfm_y_z + tfm_t_y, aligned_P_z = P.x*tfm_z_x + P.y*tfm_z_y + P.z*tfm_z_z + tfm_t_z; const ssef neg_one(-1.0f, -1.0f, -1.0f, -1.0f); const ssef nrdir_x = neg_one / aligned_dir_x, nrdir_y = neg_one / aligned_dir_y, nrdir_z = neg_one / aligned_dir_z; const ssef tlower_x = aligned_P_x * nrdir_x, tlower_y = aligned_P_y * nrdir_y, tlower_z = aligned_P_z * nrdir_z; const ssef tupper_x = tlower_x - nrdir_x, tupper_y = tlower_y - nrdir_y, tupper_z = tlower_z - nrdir_z; const float round_down = 1.0f - difl; const float round_up = 1.0f + difl; #ifdef __KERNEL_SSE41__ const ssef tnear_x = mini(tlower_x, tupper_x); const ssef tnear_y = mini(tlower_y, tupper_y); const ssef tnear_z = mini(tlower_z, tupper_z); const ssef tfar_x = maxi(tlower_x, tupper_x); const ssef tfar_y = maxi(tlower_y, tupper_y); const ssef tfar_z = maxi(tlower_z, tupper_z); #else const ssef tnear_x = min(tlower_x, tupper_x); const ssef tnear_y = min(tlower_y, tupper_y); const ssef tnear_z = min(tlower_z, tupper_z); const ssef tfar_x = max(tlower_x, tupper_x); const ssef tfar_y = max(tlower_y, tupper_y); const ssef tfar_z = max(tlower_z, tupper_z); #endif const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); const sseb vmask = round_down*tnear <= round_up*tfar; *dist = tnear; return movemask(vmask); } /* Intersectors wrappers. * * They'll check node type and call appropriate intersection code. */ ccl_device_inline int qbvh_node_intersect( KernelGlobals *ccl_restrict kg, const ssef& isect_near, const ssef& isect_far, #ifdef __KERNEL_AVX2__ const sse3f& org_idir, #endif const sse3f& org, const sse3f& dir, const sse3f& idir, const int near_x, const int near_y, const int near_z, const int far_x, const int far_y, const int far_z, const int node_addr, ssef *ccl_restrict dist) { const int offset = node_addr; const float4 node = kernel_tex_fetch(__bvh_nodes, offset); if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { return qbvh_unaligned_node_intersect(kg, isect_near, isect_far, #ifdef __KERNEL_AVX2__ org_idir, #endif org, dir, idir, near_x, near_y, near_z, far_x, far_y, far_z, node_addr, dist); } else { return qbvh_aligned_node_intersect(kg, isect_near, isect_far, #ifdef __KERNEL_AVX2__ org_idir, #else org, #endif idir, near_x, near_y, near_z, far_x, far_y, far_z, node_addr, dist); } } ccl_device_inline int qbvh_node_intersect_robust( KernelGlobals *ccl_restrict kg, const ssef& isect_near, const ssef& isect_far, #ifdef __KERNEL_AVX2__ const sse3f& P_idir, #endif const sse3f& P, const sse3f& dir, const sse3f& idir, const int near_x, const int near_y, const int near_z, const int far_x, const int far_y, const int far_z, const int node_addr, const float difl, ssef *ccl_restrict dist) { const int offset = node_addr; const float4 node = kernel_tex_fetch(__bvh_nodes, offset); if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { return qbvh_unaligned_node_intersect_robust(kg, isect_near, isect_far, #ifdef __KERNEL_AVX2__ P_idir, #endif P, dir, idir, near_x, near_y, near_z, far_x, far_y, far_z, node_addr, difl, dist); } else { return qbvh_aligned_node_intersect_robust(kg, isect_near, isect_far, #ifdef __KERNEL_AVX2__ P_idir, #else P, #endif idir, near_x, near_y, near_z, far_x, far_y, far_z, node_addr, difl, dist); } }