diff options
-rw-r--r-- | intern/cycles/CMakeLists.txt | 86 | ||||
-rw-r--r-- | intern/cycles/kernel/bvh/bvh_nodes.h | 24 | ||||
-rw-r--r-- | intern/cycles/kernel/bvh/qbvh_nodes.h | 8 |
3 files changed, 61 insertions, 57 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index c53a9f91cc0..5844c2480d6 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -41,61 +41,65 @@ elseif(WIN32 AND MSVC) set(CYCLES_AVX2_ARCH_FLAGS "/arch:SSE2") endif() + # Unlike GCC/clang we still use fast math, because there is no fine + # grained control and the speedup we get here is too big to ignore. + set(CYCLES_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") + # there is no /arch:SSE3, but intrinsics are available anyway if(CMAKE_CL_64) - set(CYCLES_SSE2_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") - set(CYCLES_SSE3_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") - set(CYCLES_SSE41_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") - set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") - set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") + set(CYCLES_SSE2_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}") + set(CYCLES_SSE3_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}") + set(CYCLES_SSE41_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}") + set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}") + set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}") else() - set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") - set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") - set(CYCLES_SSE41_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") - set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") - set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") + set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}") + set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}") + set(CYCLES_SSE41_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}") + set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}") + set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}") endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CYCLES_KERNEL_FLAGS}") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Ox") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Ox") set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /Ox") - - set(CYCLES_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") -elseif(CMAKE_COMPILER_IS_GNUCC) +elseif(CMAKE_COMPILER_IS_GNUCC OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) check_cxx_compiler_flag(-msse CXX_HAS_SSE) check_cxx_compiler_flag(-mavx CXX_HAS_AVX) check_cxx_compiler_flag(-mavx2 CXX_HAS_AVX2) - set(CYCLES_KERNEL_FLAGS "-ffast-math") - if(CXX_HAS_SSE) - set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2 -mfpmath=sse") - set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse") - set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse") - endif() - if(CXX_HAS_AVX) - set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mfpmath=sse") - endif() - if(CXX_HAS_AVX2) - set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c -mfpmath=sse") + + # Assume no signal trapping for better code generation. + set(CYCLES_KERNEL_FLAGS "-fno-trapping-math") + # Avoid overhead of setting errno for NaNs. + set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -fno-math-errno") + # Let compiler optimize 0.0 - x without worrying about signed zeros. + set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -fno-signed-zeros") + + if(CMAKE_COMPILER_IS_GNUCC) + # Assume no signal trapping for better code generation. + set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -fno-signaling-nans") + # Assume a fixed rounding mode for better constant folding. + set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -fno-rounding-math") endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -fno-finite-math-only") -elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") - check_cxx_compiler_flag(-msse CXX_HAS_SSE) - check_cxx_compiler_flag(-mavx CXX_HAS_AVX) - check_cxx_compiler_flag(-mavx2 CXX_HAS_AVX2) - set(CYCLES_KERNEL_FLAGS "-ffast-math") + if(CXX_HAS_SSE) - set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2") - set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3") - set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1") - endif() - if(CXX_HAS_AVX) - set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx") - endif() - if(CXX_HAS_AVX2) - set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c") + if(CMAKE_COMPILER_IS_GNUCC) + set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -mfpmath=sse") + endif() + + set(CYCLES_SSE2_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -msse -msse2") + set(CYCLES_SSE3_KERNEL_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS} -msse3 -mssse3") + set(CYCLES_SSE41_KERNEL_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS} -msse4.1") + if(CXX_HAS_AVX) + set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS} -mavx") + endif() + if(CXX_HAS_AVX2) + set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS} -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c") + endif() endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -fno-finite-math-only") + + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CYCLES_KERNEL_FLAGS}") endif() if(CXX_HAS_SSE) diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h index 5f1dd434a44..6c33dad5426 100644 --- a/intern/cycles/kernel/bvh/bvh_nodes.h +++ b/intern/cycles/kernel/bvh/bvh_nodes.h @@ -52,8 +52,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg, float c0hiy = (node1.z - P.y) * idir.y; float c0loz = (node2.x - P.z) * idir.z; float c0hiz = (node2.z - P.z) * idir.z; - float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); + float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz)); + float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz)); float c1lox = (node0.y - P.x) * idir.x; float c1hix = (node0.w - P.x) * idir.x; @@ -61,8 +61,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg, float c1hiy = (node1.w - P.y) * idir.y; float c1loz = (node2.y - P.z) * idir.z; float c1hiz = (node2.w - P.z) * idir.z; - float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); + float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz)); + float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz)); dist[0] = c0min; dist[1] = c1min; @@ -101,8 +101,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg, float c0hiy = (node1.z - P.y) * idir.y; float c0loz = (node2.x - P.z) * idir.z; float c0hiz = (node2.z - P.z) * idir.z; - float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); + float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz)); + float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz)); float c1lox = (node0.y - P.x) * idir.x; float c1hix = (node0.w - P.x) * idir.x; @@ -110,8 +110,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg, float c1hiy = (node1.w - P.y) * idir.y; float c1loz = (node2.y - P.z) * idir.z; float c1hiz = (node2.w - P.z) * idir.z; - float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); + float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz)); + float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz)); if(difl != 0.0f) { float hdiff = 1.0f + difl; @@ -483,8 +483,8 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, ssef tfar_y = max(lower_y, upper_y); ssef tfar_z = max(lower_z, upper_z); - const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); - const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); sseb vmask = tnear <= tfar; dist[0] = tnear.f[0]; dist[1] = tnear.f[1]; @@ -545,8 +545,8 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg ssef tfar_y = max(lower_y, upper_y); ssef tfar_z = max(lower_z, upper_z); - const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); - const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); sseb vmask; if(difl != 0.0f) { const float round_down = 1.0f - difl; diff --git a/intern/cycles/kernel/bvh/qbvh_nodes.h b/intern/cycles/kernel/bvh/qbvh_nodes.h index 6d22f0b0d6a..3036efd4198 100644 --- a/intern/cycles/kernel/bvh/qbvh_nodes.h +++ b/intern/cycles/kernel/bvh/qbvh_nodes.h @@ -126,8 +126,8 @@ ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg const sseb vmask = cast(tnear) > cast(tfar); int mask = (int)movemask(vmask)^0xf; #else - const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); - const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); const sseb vmask = tnear <= tfar; int mask = (int)movemask(vmask); #endif @@ -174,8 +174,8 @@ ccl_device_inline int qbvh_aligned_node_intersect_robust( const float round_down = 1.0f - difl; const float round_up = 1.0f + difl; - const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); - const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); const sseb vmask = round_down*tnear <= round_up*tfar; *dist = tnear; return (int)movemask(vmask); |