diff options
Diffstat (limited to 'intern/cycles/kernel/bvh/obvh_nodes.h')
-rw-r--r-- | intern/cycles/kernel/bvh/obvh_nodes.h | 817 |
1 files changed, 438 insertions, 379 deletions
diff --git a/intern/cycles/kernel/bvh/obvh_nodes.h b/intern/cycles/kernel/bvh/obvh_nodes.h index 93f35f6dffb..6831562cade 100644 --- a/intern/cycles/kernel/bvh/obvh_nodes.h +++ b/intern/cycles/kernel/bvh/obvh_nodes.h @@ -17,11 +17,11 @@ */ struct OBVHStackItem { - int addr; - float dist; + int addr; + float dist; }; -ccl_device_inline void obvh_near_far_idx_calc(const float3& idir, +ccl_device_inline void obvh_near_far_idx_calc(const float3 &idir, int *ccl_restrict near_x, int *ccl_restrict near_y, int *ccl_restrict near_z, @@ -31,41 +31,73 @@ ccl_device_inline void obvh_near_far_idx_calc(const float3& idir, { #ifdef __KERNEL_SSE__ - *near_x = 0; *far_x = 1; - *near_y = 2; *far_y = 3; - *near_z = 4; *far_z = 5; - - const size_t mask = movemask(ssef(idir.m128)); - - const int mask_x = mask & 1; - const int mask_y = (mask & 2) >> 1; - const int mask_z = (mask & 4) >> 2; - - *near_x += mask_x; *far_x -= mask_x; - *near_y += mask_y; *far_y -= mask_y; - *near_z += mask_z; *far_z -= mask_z; + *near_x = 0; + *far_x = 1; + *near_y = 2; + *far_y = 3; + *near_z = 4; + *far_z = 5; + + const size_t mask = movemask(ssef(idir.m128)); + + const int mask_x = mask & 1; + const int mask_y = (mask & 2) >> 1; + const int mask_z = (mask & 4) >> 2; + + *near_x += mask_x; + *far_x -= mask_x; + *near_y += mask_y; + *far_y -= mask_y; + *near_z += mask_z; + *far_z -= mask_z; #else - if(idir.x >= 0.0f) { *near_x = 0; *far_x = 1; } else { *near_x = 1; *far_x = 0; } - if(idir.y >= 0.0f) { *near_y = 2; *far_y = 3; } else { *near_y = 3; *far_y = 2; } - if(idir.z >= 0.0f) { *near_z = 4; *far_z = 5; } else { *near_z = 5; *far_z = 4; } + if (idir.x >= 0.0f) { + *near_x = 0; + *far_x = 1; + } + else { + *near_x = 1; + *far_x = 0; + } + if (idir.y >= 0.0f) { + *near_y = 2; + *far_y = 3; + } + else { + *near_y = 3; + *far_y = 2; + } + if (idir.z >= 0.0f) { + *near_z = 4; + *far_z = 5; + } + else { + *near_z = 5; + *far_z = 4; + } #endif } -ccl_device_inline void obvh_item_swap(OBVHStackItem *ccl_restrict a, - OBVHStackItem *ccl_restrict b) +ccl_device_inline void obvh_item_swap(OBVHStackItem *ccl_restrict a, OBVHStackItem *ccl_restrict b) { - OBVHStackItem tmp = *a; - *a = *b; - *b = tmp; + OBVHStackItem tmp = *a; + *a = *b; + *b = tmp; } ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, OBVHStackItem *ccl_restrict s2, OBVHStackItem *ccl_restrict s3) { - if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); } - if(s3->dist < s2->dist) { obvh_item_swap(s3, s2); } - if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); } + if (s2->dist < s1->dist) { + obvh_item_swap(s2, s1); + } + if (s3->dist < s2->dist) { + obvh_item_swap(s3, s2); + } + if (s2->dist < s1->dist) { + obvh_item_swap(s2, s1); + } } ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, @@ -73,11 +105,21 @@ ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, OBVHStackItem *ccl_restrict s3, OBVHStackItem *ccl_restrict s4) { - if(s2->dist < s1->dist) { obvh_item_swap(s2, s1); } - if(s4->dist < s3->dist) { obvh_item_swap(s4, s3); } - if(s3->dist < s1->dist) { obvh_item_swap(s3, s1); } - if(s4->dist < s2->dist) { obvh_item_swap(s4, s2); } - if(s3->dist < s2->dist) { obvh_item_swap(s3, s2); } + if (s2->dist < s1->dist) { + obvh_item_swap(s2, s1); + } + if (s4->dist < s3->dist) { + obvh_item_swap(s4, s3); + } + if (s3->dist < s1->dist) { + obvh_item_swap(s3, s1); + } + if (s4->dist < s2->dist) { + obvh_item_swap(s4, s2); + } + if (s3->dist < s2->dist) { + obvh_item_swap(s3, s2); + } } ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, @@ -86,19 +128,19 @@ ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, OBVHStackItem *ccl_restrict s4, OBVHStackItem *ccl_restrict s5) { - obvh_stack_sort(s1, s2, s3, s4); - if(s5->dist < s4->dist) { - obvh_item_swap(s4, s5); - if(s4->dist < s3->dist) { - obvh_item_swap(s3, s4); - if(s3->dist < s2->dist) { - obvh_item_swap(s2, s3); - if(s2->dist < s1->dist) { - obvh_item_swap(s1, s2); - } - } - } - } + obvh_stack_sort(s1, s2, s3, s4); + if (s5->dist < s4->dist) { + obvh_item_swap(s4, s5); + if (s4->dist < s3->dist) { + obvh_item_swap(s3, s4); + if (s3->dist < s2->dist) { + obvh_item_swap(s2, s3); + if (s2->dist < s1->dist) { + obvh_item_swap(s1, s2); + } + } + } + } } ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, @@ -108,22 +150,22 @@ ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, OBVHStackItem *ccl_restrict s5, OBVHStackItem *ccl_restrict s6) { - obvh_stack_sort(s1, s2, s3, s4, s5); - if(s6->dist < s5->dist) { - obvh_item_swap(s5, s6); - if(s5->dist < s4->dist) { - obvh_item_swap(s4, s5); - if(s4->dist < s3->dist) { - obvh_item_swap(s3, s4); - if(s3->dist < s2->dist) { - obvh_item_swap(s2, s3); - if(s2->dist < s1->dist) { - obvh_item_swap(s1, s2); - } - } - } - } - } + obvh_stack_sort(s1, s2, s3, s4, s5); + if (s6->dist < s5->dist) { + obvh_item_swap(s5, s6); + if (s5->dist < s4->dist) { + obvh_item_swap(s4, s5); + if (s4->dist < s3->dist) { + obvh_item_swap(s3, s4); + if (s3->dist < s2->dist) { + obvh_item_swap(s2, s3); + if (s2->dist < s1->dist) { + obvh_item_swap(s1, s2); + } + } + } + } + } } ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, @@ -134,25 +176,25 @@ ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, OBVHStackItem *ccl_restrict s6, OBVHStackItem *ccl_restrict s7) { - obvh_stack_sort(s1, s2, s3, s4, s5, s6); - if(s7->dist < s6->dist) { - obvh_item_swap(s6, s7); - if(s6->dist < s5->dist) { - obvh_item_swap(s5, s6); - if(s5->dist < s4->dist) { - obvh_item_swap(s4, s5); - if(s4->dist < s3->dist) { - obvh_item_swap(s3, s4); - if(s3->dist < s2->dist) { - obvh_item_swap(s2, s3); - if(s2->dist < s1->dist) { - obvh_item_swap(s1, s2); - } - } - } - } - } - } + obvh_stack_sort(s1, s2, s3, s4, s5, s6); + if (s7->dist < s6->dist) { + obvh_item_swap(s6, s7); + if (s6->dist < s5->dist) { + obvh_item_swap(s5, s6); + if (s5->dist < s4->dist) { + obvh_item_swap(s4, s5); + if (s4->dist < s3->dist) { + obvh_item_swap(s3, s4); + if (s3->dist < s2->dist) { + obvh_item_swap(s2, s3); + if (s2->dist < s1->dist) { + obvh_item_swap(s1, s2); + } + } + } + } + } + } } ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, @@ -164,41 +206,41 @@ ccl_device_inline void obvh_stack_sort(OBVHStackItem *ccl_restrict s1, OBVHStackItem *ccl_restrict s7, OBVHStackItem *ccl_restrict s8) { - obvh_stack_sort(s1, s2, s3, s4, s5, s6, s7); - if(s8->dist < s7->dist) { - obvh_item_swap(s7, s8); - if(s7->dist < s6->dist) { - obvh_item_swap(s6, s7); - if(s6->dist < s5->dist) { - obvh_item_swap(s5, s6); - if(s5->dist < s4->dist) { - obvh_item_swap(s4, s5); - if(s4->dist < s3->dist) { - obvh_item_swap(s3, s4); - if(s3->dist < s2->dist) { - obvh_item_swap(s2, s3); - if(s2->dist < s1->dist) { - obvh_item_swap(s1, s2); - } - } - } - } - } - } - } + obvh_stack_sort(s1, s2, s3, s4, s5, s6, s7); + if (s8->dist < s7->dist) { + obvh_item_swap(s7, s8); + if (s7->dist < s6->dist) { + obvh_item_swap(s6, s7); + if (s6->dist < s5->dist) { + obvh_item_swap(s5, s6); + if (s5->dist < s4->dist) { + obvh_item_swap(s4, s5); + if (s4->dist < s3->dist) { + obvh_item_swap(s3, s4); + if (s3->dist < s2->dist) { + obvh_item_swap(s2, s3); + if (s2->dist < s1->dist) { + obvh_item_swap(s1, s2); + } + } + } + } + } + } + } } /* Axis-aligned nodes intersection */ ccl_device_inline int obvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg, - const avxf& isect_near, - const avxf& isect_far, + const avxf &isect_near, + const avxf &isect_far, #ifdef __KERNEL_AVX2__ - const avx3f& org_idir, + const avx3f &org_idir, #else - const avx3f& org, + const avx3f &org, #endif - const avx3f& idir, + const avx3f &idir, const int near_x, const int near_y, const int near_z, @@ -208,213 +250,216 @@ ccl_device_inline int obvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg const int node_addr, avxf *ccl_restrict dist) { - const int offset = node_addr + 2; + const int offset = node_addr + 2; #ifdef __KERNEL_AVX2__ - const avxf tnear_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_x*2), idir.x, org_idir.x); - const avxf tnear_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_y*2), idir.y, org_idir.y); - const avxf tnear_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+near_z*2), idir.z, org_idir.z); - const avxf tfar_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_x*2), idir.x, org_idir.x); - const avxf tfar_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_y*2), idir.y, org_idir.y); - const avxf tfar_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset+far_z*2), idir.z, org_idir.z); - - const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); - const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); - const avxb vmask = tnear <= tfar; - int mask = (int)movemask(vmask); - *dist = tnear; - return mask; + const avxf tnear_x = msub( + kernel_tex_fetch_avxf(__bvh_nodes, offset + near_x * 2), idir.x, org_idir.x); + const avxf tnear_y = msub( + kernel_tex_fetch_avxf(__bvh_nodes, offset + near_y * 2), idir.y, org_idir.y); + const avxf tnear_z = msub( + kernel_tex_fetch_avxf(__bvh_nodes, offset + near_z * 2), idir.z, org_idir.z); + const avxf tfar_x = msub( + kernel_tex_fetch_avxf(__bvh_nodes, offset + far_x * 2), idir.x, org_idir.x); + const avxf tfar_y = msub( + kernel_tex_fetch_avxf(__bvh_nodes, offset + far_y * 2), idir.y, org_idir.y); + const avxf tfar_z = msub( + kernel_tex_fetch_avxf(__bvh_nodes, offset + far_z * 2), idir.z, org_idir.z); + + const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); + const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const avxb vmask = tnear <= tfar; + int mask = (int)movemask(vmask); + *dist = tnear; + return mask; #else - return 0; + return 0; #endif } -ccl_device_inline int obvh_aligned_node_intersect_robust( - KernelGlobals *ccl_restrict kg, - const avxf& isect_near, - const avxf& isect_far, +ccl_device_inline int obvh_aligned_node_intersect_robust(KernelGlobals *ccl_restrict kg, + const avxf &isect_near, + const avxf &isect_far, #ifdef __KERNEL_AVX2__ - const avx3f& P_idir, + const avx3f &P_idir, #else - const avx3f& P, + const avx3f &P, #endif - const avx3f& idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int node_addr, - const float difl, - avxf *ccl_restrict dist) + const avx3f &idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + const float difl, + avxf *ccl_restrict dist) { - const int offset = node_addr + 2; + const int offset = node_addr + 2; #ifdef __KERNEL_AVX2__ - const avxf tnear_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_x * 2), idir.x, P_idir.x); - const avxf tfar_x = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_x * 2), idir.x, P_idir.x); - const avxf tnear_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_y * 2), idir.y, P_idir.y); - const avxf tfar_y = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_y * 2), idir.y, P_idir.y); - const avxf tnear_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + near_z * 2), idir.z, P_idir.z); - const avxf tfar_z = msub(kernel_tex_fetch_avxf(__bvh_nodes, offset + far_z * 2), idir.z, P_idir.z); - - const float round_down = 1.0f - difl; - const float round_up = 1.0f + difl; - const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); - const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); - const avxb vmask = round_down*tnear <= round_up*tfar; - int mask = (int)movemask(vmask); - *dist = tnear; - return mask; + const avxf tnear_x = msub( + kernel_tex_fetch_avxf(__bvh_nodes, offset + near_x * 2), idir.x, P_idir.x); + const avxf tfar_x = msub( + kernel_tex_fetch_avxf(__bvh_nodes, offset + far_x * 2), idir.x, P_idir.x); + const avxf tnear_y = msub( + kernel_tex_fetch_avxf(__bvh_nodes, offset + near_y * 2), idir.y, P_idir.y); + const avxf tfar_y = msub( + kernel_tex_fetch_avxf(__bvh_nodes, offset + far_y * 2), idir.y, P_idir.y); + const avxf tnear_z = msub( + kernel_tex_fetch_avxf(__bvh_nodes, offset + near_z * 2), idir.z, P_idir.z); + const avxf tfar_z = msub( + kernel_tex_fetch_avxf(__bvh_nodes, offset + far_z * 2), idir.z, P_idir.z); + + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + const avxf tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); + const avxf tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const avxb vmask = round_down * tnear <= round_up * tfar; + int mask = (int)movemask(vmask); + *dist = tnear; + return mask; #else - return 0; + return 0; #endif } /* Unaligned nodes intersection */ -ccl_device_inline int obvh_unaligned_node_intersect( - KernelGlobals *ccl_restrict kg, - const avxf& isect_near, - const avxf& isect_far, +ccl_device_inline int obvh_unaligned_node_intersect(KernelGlobals *ccl_restrict kg, + const avxf &isect_near, + const avxf &isect_far, #ifdef __KERNEL_AVX2__ - const avx3f& org_idir, + const avx3f &org_idir, #endif - const avx3f& org, - const avx3f& dir, - const avx3f& idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int node_addr, - avxf *ccl_restrict dist) + const avx3f &org, + const avx3f &dir, + const avx3f &idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + avxf *ccl_restrict dist) { - const int offset = node_addr; - const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+2); - const avxf tfm_x_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+4); - const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+6); - - const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+8); - const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+10); - const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+12); - - const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+14); - const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+16); - const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+18); - - const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+20); - const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+22); - const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+24); - - const avxf aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z, - aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z, - aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z; - - const avxf aligned_P_x = org.x*tfm_x_x + org.y*tfm_x_y + org.z*tfm_x_z + tfm_t_x, - aligned_P_y = org.x*tfm_y_x + org.y*tfm_y_y + org.z*tfm_y_z + tfm_t_y, - aligned_P_z = org.x*tfm_z_x + org.y*tfm_z_y + org.z*tfm_z_z + tfm_t_z; - - const avxf neg_one(-1.0f); - const avxf nrdir_x = neg_one / aligned_dir_x, - nrdir_y = neg_one / aligned_dir_y, - nrdir_z = neg_one / aligned_dir_z; - - const avxf tlower_x = aligned_P_x * nrdir_x, - tlower_y = aligned_P_y * nrdir_y, - tlower_z = aligned_P_z * nrdir_z; - - const avxf tupper_x = tlower_x - nrdir_x, - tupper_y = tlower_y - nrdir_y, - tupper_z = tlower_z - nrdir_z; - - const avxf tnear_x = min(tlower_x, tupper_x); - const avxf tnear_y = min(tlower_y, tupper_y); - const avxf tnear_z = min(tlower_z, tupper_z); - const avxf tfar_x = max(tlower_x, tupper_x); - const avxf tfar_y = max(tlower_y, tupper_y); - const avxf tfar_z = max(tlower_z, tupper_z); - const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); - const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); - const avxb vmask = tnear <= tfar; - *dist = tnear; - return movemask(vmask); + const int offset = node_addr; + const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 2); + const avxf tfm_x_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 4); + const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 6); + + const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 8); + const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 10); + const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 12); + + const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 14); + const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 16); + const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 18); + + const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 20); + const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 22); + const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 24); + + const avxf aligned_dir_x = dir.x * tfm_x_x + dir.y * tfm_x_y + dir.z * tfm_x_z, + aligned_dir_y = dir.x * tfm_y_x + dir.y * tfm_y_y + dir.z * tfm_y_z, + aligned_dir_z = dir.x * tfm_z_x + dir.y * tfm_z_y + dir.z * tfm_z_z; + + const avxf aligned_P_x = org.x * tfm_x_x + org.y * tfm_x_y + org.z * tfm_x_z + tfm_t_x, + aligned_P_y = org.x * tfm_y_x + org.y * tfm_y_y + org.z * tfm_y_z + tfm_t_y, + aligned_P_z = org.x * tfm_z_x + org.y * tfm_z_y + org.z * tfm_z_z + tfm_t_z; + + const avxf neg_one(-1.0f); + const avxf nrdir_x = neg_one / aligned_dir_x, nrdir_y = neg_one / aligned_dir_y, + nrdir_z = neg_one / aligned_dir_z; + + const avxf tlower_x = aligned_P_x * nrdir_x, tlower_y = aligned_P_y * nrdir_y, + tlower_z = aligned_P_z * nrdir_z; + + const avxf tupper_x = tlower_x - nrdir_x, tupper_y = tlower_y - nrdir_y, + tupper_z = tlower_z - nrdir_z; + + const avxf tnear_x = min(tlower_x, tupper_x); + const avxf tnear_y = min(tlower_y, tupper_y); + const avxf tnear_z = min(tlower_z, tupper_z); + const avxf tfar_x = max(tlower_x, tupper_x); + const avxf tfar_y = max(tlower_y, tupper_y); + const avxf tfar_z = max(tlower_z, tupper_z); + const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + const avxb vmask = tnear <= tfar; + *dist = tnear; + return movemask(vmask); } -ccl_device_inline int obvh_unaligned_node_intersect_robust( - KernelGlobals *ccl_restrict kg, - const avxf& isect_near, - const avxf& isect_far, +ccl_device_inline int obvh_unaligned_node_intersect_robust(KernelGlobals *ccl_restrict kg, + const avxf &isect_near, + const avxf &isect_far, #ifdef __KERNEL_AVX2__ - const avx3f& P_idir, + const avx3f &P_idir, #endif - const avx3f& P, - const avx3f& dir, - const avx3f& idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int node_addr, - const float difl, - avxf *ccl_restrict dist) + const avx3f &P, + const avx3f &dir, + const avx3f &idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + const float difl, + avxf *ccl_restrict dist) { - const int offset = node_addr; - const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+2); - const avxf tfm_x_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+4); - const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+6); - - const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+8); - const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+10); - const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+12); - - const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+14); - const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+16); - const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+18); - - const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset+20); - const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset+22); - const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset+24); - - const avxf aligned_dir_x = dir.x*tfm_x_x + dir.y*tfm_x_y + dir.z*tfm_x_z, - aligned_dir_y = dir.x*tfm_y_x + dir.y*tfm_y_y + dir.z*tfm_y_z, - aligned_dir_z = dir.x*tfm_z_x + dir.y*tfm_z_y + dir.z*tfm_z_z; - - const avxf aligned_P_x = P.x*tfm_x_x + P.y*tfm_x_y + P.z*tfm_x_z + tfm_t_x, - aligned_P_y = P.x*tfm_y_x + P.y*tfm_y_y + P.z*tfm_y_z + tfm_t_y, - aligned_P_z = P.x*tfm_z_x + P.y*tfm_z_y + P.z*tfm_z_z + tfm_t_z; - - const avxf neg_one(-1.0f); - const avxf nrdir_x = neg_one / aligned_dir_x, - nrdir_y = neg_one / aligned_dir_y, - nrdir_z = neg_one / aligned_dir_z; - - const avxf tlower_x = aligned_P_x * nrdir_x, - tlower_y = aligned_P_y * nrdir_y, - tlower_z = aligned_P_z * nrdir_z; - - const avxf tupper_x = tlower_x - nrdir_x, - tupper_y = tlower_y - nrdir_y, - tupper_z = tlower_z - nrdir_z; - - const float round_down = 1.0f - difl; - const float round_up = 1.0f + difl; - - const avxf tnear_x = min(tlower_x, tupper_x); - const avxf tnear_y = min(tlower_y, tupper_y); - const avxf tnear_z = min(tlower_z, tupper_z); - const avxf tfar_x = max(tlower_x, tupper_x); - const avxf tfar_y = max(tlower_y, tupper_y); - const avxf tfar_z = max(tlower_z, tupper_z); - - const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); - const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); - const avxb vmask = round_down*tnear <= round_up*tfar; - *dist = tnear; - return movemask(vmask); + const int offset = node_addr; + const avxf tfm_x_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 2); + const avxf tfm_x_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 4); + const avxf tfm_x_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 6); + + const avxf tfm_y_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 8); + const avxf tfm_y_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 10); + const avxf tfm_y_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 12); + + const avxf tfm_z_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 14); + const avxf tfm_z_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 16); + const avxf tfm_z_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 18); + + const avxf tfm_t_x = kernel_tex_fetch_avxf(__bvh_nodes, offset + 20); + const avxf tfm_t_y = kernel_tex_fetch_avxf(__bvh_nodes, offset + 22); + const avxf tfm_t_z = kernel_tex_fetch_avxf(__bvh_nodes, offset + 24); + + const avxf aligned_dir_x = dir.x * tfm_x_x + dir.y * tfm_x_y + dir.z * tfm_x_z, + aligned_dir_y = dir.x * tfm_y_x + dir.y * tfm_y_y + dir.z * tfm_y_z, + aligned_dir_z = dir.x * tfm_z_x + dir.y * tfm_z_y + dir.z * tfm_z_z; + + const avxf aligned_P_x = P.x * tfm_x_x + P.y * tfm_x_y + P.z * tfm_x_z + tfm_t_x, + aligned_P_y = P.x * tfm_y_x + P.y * tfm_y_y + P.z * tfm_y_z + tfm_t_y, + aligned_P_z = P.x * tfm_z_x + P.y * tfm_z_y + P.z * tfm_z_z + tfm_t_z; + + const avxf neg_one(-1.0f); + const avxf nrdir_x = neg_one / aligned_dir_x, nrdir_y = neg_one / aligned_dir_y, + nrdir_z = neg_one / aligned_dir_z; + + const avxf tlower_x = aligned_P_x * nrdir_x, tlower_y = aligned_P_y * nrdir_y, + tlower_z = aligned_P_z * nrdir_z; + + const avxf tupper_x = tlower_x - nrdir_x, tupper_y = tlower_y - nrdir_y, + tupper_z = tlower_z - nrdir_z; + + const float round_down = 1.0f - difl; + const float round_up = 1.0f + difl; + + const avxf tnear_x = min(tlower_x, tupper_x); + const avxf tnear_y = min(tlower_y, tupper_y); + const avxf tnear_z = min(tlower_z, tupper_z); + const avxf tfar_x = max(tlower_x, tupper_x); + const avxf tfar_y = max(tlower_y, tupper_y); + const avxf tfar_z = max(tlower_z, tupper_z); + + const avxf tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const avxf tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); + const avxb vmask = round_down * tnear <= round_up * tfar; + *dist = tnear; + return movemask(vmask); } /* Intersectors wrappers. @@ -422,111 +467,125 @@ ccl_device_inline int obvh_unaligned_node_intersect_robust( * They'll check node type and call appropriate intersection code. */ -ccl_device_inline int obvh_node_intersect( - KernelGlobals *ccl_restrict kg, - const avxf& isect_near, - const avxf& isect_far, +ccl_device_inline int obvh_node_intersect(KernelGlobals *ccl_restrict kg, + const avxf &isect_near, + const avxf &isect_far, #ifdef __KERNEL_AVX2__ - const avx3f& org_idir, + const avx3f &org_idir, #endif - const avx3f& org, - const avx3f& dir, - const avx3f& idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int node_addr, - avxf *ccl_restrict dist) + const avx3f &org, + const avx3f &dir, + const avx3f &idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + avxf *ccl_restrict dist) { - const int offset = node_addr; - const float4 node = kernel_tex_fetch(__bvh_nodes, offset); - if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { - return obvh_unaligned_node_intersect(kg, - isect_near, - isect_far, + const int offset = node_addr; + const float4 node = kernel_tex_fetch(__bvh_nodes, offset); + if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return obvh_unaligned_node_intersect(kg, + isect_near, + isect_far, #ifdef __KERNEL_AVX2__ - org_idir, + org_idir, #endif - org, - dir, - idir, - near_x, near_y, near_z, - far_x, far_y, far_z, - node_addr, - dist); - } - else { - return obvh_aligned_node_intersect(kg, - isect_near, - isect_far, + org, + dir, + idir, + near_x, + near_y, + near_z, + far_x, + far_y, + far_z, + node_addr, + dist); + } + else { + return obvh_aligned_node_intersect(kg, + isect_near, + isect_far, #ifdef __KERNEL_AVX2__ - org_idir, + org_idir, #else - org, + org, #endif - idir, - near_x, near_y, near_z, - far_x, far_y, far_z, - node_addr, - dist); - } + idir, + near_x, + near_y, + near_z, + far_x, + far_y, + far_z, + node_addr, + dist); + } } -ccl_device_inline int obvh_node_intersect_robust( - KernelGlobals *ccl_restrict kg, - const avxf& isect_near, - const avxf& isect_far, +ccl_device_inline int obvh_node_intersect_robust(KernelGlobals *ccl_restrict kg, + const avxf &isect_near, + const avxf &isect_far, #ifdef __KERNEL_AVX2__ - const avx3f& P_idir, + const avx3f &P_idir, #endif - const avx3f& P, - const avx3f& dir, - const avx3f& idir, - const int near_x, - const int near_y, - const int near_z, - const int far_x, - const int far_y, - const int far_z, - const int node_addr, - const float difl, - avxf *ccl_restrict dist) + const avx3f &P, + const avx3f &dir, + const avx3f &idir, + const int near_x, + const int near_y, + const int near_z, + const int far_x, + const int far_y, + const int far_z, + const int node_addr, + const float difl, + avxf *ccl_restrict dist) { - const int offset = node_addr; - const float4 node = kernel_tex_fetch(__bvh_nodes, offset); - if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { - return obvh_unaligned_node_intersect_robust(kg, - isect_near, - isect_far, + const int offset = node_addr; + const float4 node = kernel_tex_fetch(__bvh_nodes, offset); + if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { + return obvh_unaligned_node_intersect_robust(kg, + isect_near, + isect_far, #ifdef __KERNEL_AVX2__ - P_idir, + P_idir, #endif - P, - dir, - idir, - near_x, near_y, near_z, - far_x, far_y, far_z, - node_addr, - difl, - dist); - } - else { - return obvh_aligned_node_intersect_robust(kg, - isect_near, - isect_far, + P, + dir, + idir, + near_x, + near_y, + near_z, + far_x, + far_y, + far_z, + node_addr, + difl, + dist); + } + else { + return obvh_aligned_node_intersect_robust(kg, + isect_near, + isect_far, #ifdef __KERNEL_AVX2__ - P_idir, + P_idir, #else - P, + P, #endif - idir, - near_x, near_y, near_z, - far_x, far_y, far_z, - node_addr, - difl, - dist); - } + idir, + near_x, + near_y, + near_z, + far_x, + far_y, + far_z, + node_addr, + difl, + dist); + } } |