Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/intern
diff options
context:
space:
mode:
authorCampbell Barton <ideasman42@gmail.com>2017-09-11 09:52:53 +0300
committerCampbell Barton <ideasman42@gmail.com>2017-09-11 09:52:53 +0300
commit7d7741d25fb7d248b557c6f5270510d7c0dbe923 (patch)
treecfb288dc6297f35be54f73470b9f465238f88281 /intern
parent49ba9d02d8dc5e1aef3cc0f0b3453866125af64c (diff)
parentf56fea3d6b481b70e5ca518e41dab8c00bc26251 (diff)
Merge branch 'master' into blender2.8
Diffstat (limited to 'intern')
-rw-r--r--intern/cycles/CMakeLists.txt86
-rw-r--r--intern/cycles/kernel/bvh/bvh_nodes.h24
-rw-r--r--intern/cycles/kernel/bvh/qbvh_nodes.h8
-rw-r--r--intern/cycles/kernel/kernel_compat_cuda.h6
4 files changed, 66 insertions, 58 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index c53a9f91cc0..5844c2480d6 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -41,61 +41,65 @@ elseif(WIN32 AND MSVC)
set(CYCLES_AVX2_ARCH_FLAGS "/arch:SSE2")
endif()
+ # Unlike GCC/clang we still use fast math, because there is no fine
+ # grained control and the speedup we get here is too big to ignore.
+ set(CYCLES_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
+
# there is no /arch:SSE3, but intrinsics are available anyway
if(CMAKE_CL_64)
- set(CYCLES_SSE2_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
- set(CYCLES_SSE3_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
- set(CYCLES_SSE41_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
- set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
- set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
+ set(CYCLES_SSE2_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}")
+ set(CYCLES_SSE3_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}")
+ set(CYCLES_SSE41_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}")
+ set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}")
+ set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}")
else()
- set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
- set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
- set(CYCLES_SSE41_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
- set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
- set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
+ set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}")
+ set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}")
+ set(CYCLES_SSE41_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}")
+ set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}")
+ set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}")
endif()
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CYCLES_KERNEL_FLAGS}")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Ox")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Ox")
set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /Ox")
-
- set(CYCLES_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
-elseif(CMAKE_COMPILER_IS_GNUCC)
+elseif(CMAKE_COMPILER_IS_GNUCC OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang"))
check_cxx_compiler_flag(-msse CXX_HAS_SSE)
check_cxx_compiler_flag(-mavx CXX_HAS_AVX)
check_cxx_compiler_flag(-mavx2 CXX_HAS_AVX2)
- set(CYCLES_KERNEL_FLAGS "-ffast-math")
- if(CXX_HAS_SSE)
- set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2 -mfpmath=sse")
- set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse")
- set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse")
- endif()
- if(CXX_HAS_AVX)
- set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mfpmath=sse")
- endif()
- if(CXX_HAS_AVX2)
- set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c -mfpmath=sse")
+
+ # Assume no signal trapping for better code generation.
+ set(CYCLES_KERNEL_FLAGS "-fno-trapping-math")
+ # Avoid overhead of setting errno for NaNs.
+ set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -fno-math-errno")
+ # Let compiler optimize 0.0 - x without worrying about signed zeros.
+ set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -fno-signed-zeros")
+
+ if(CMAKE_COMPILER_IS_GNUCC)
+ # Assume no signal trapping for better code generation.
+ set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -fno-signaling-nans")
+ # Assume a fixed rounding mode for better constant folding.
+ set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -fno-rounding-math")
endif()
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -fno-finite-math-only")
-elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
- check_cxx_compiler_flag(-msse CXX_HAS_SSE)
- check_cxx_compiler_flag(-mavx CXX_HAS_AVX)
- check_cxx_compiler_flag(-mavx2 CXX_HAS_AVX2)
- set(CYCLES_KERNEL_FLAGS "-ffast-math")
+
if(CXX_HAS_SSE)
- set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2")
- set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3")
- set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1")
- endif()
- if(CXX_HAS_AVX)
- set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx")
- endif()
- if(CXX_HAS_AVX2)
- set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c")
+ if(CMAKE_COMPILER_IS_GNUCC)
+ set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -mfpmath=sse")
+ endif()
+
+ set(CYCLES_SSE2_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -msse -msse2")
+ set(CYCLES_SSE3_KERNEL_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS} -msse3 -mssse3")
+ set(CYCLES_SSE41_KERNEL_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS} -msse4.1")
+ if(CXX_HAS_AVX)
+ set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS} -mavx")
+ endif()
+ if(CXX_HAS_AVX2)
+ set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS} -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c")
+ endif()
endif()
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -fno-finite-math-only")
+
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CYCLES_KERNEL_FLAGS}")
endif()
if(CXX_HAS_SSE)
diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h
index 5f1dd434a44..6c33dad5426 100644
--- a/intern/cycles/kernel/bvh/bvh_nodes.h
+++ b/intern/cycles/kernel/bvh/bvh_nodes.h
@@ -52,8 +52,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
float c0hiy = (node1.z - P.y) * idir.y;
float c0loz = (node2.x - P.z) * idir.z;
float c0hiz = (node2.z - P.z) * idir.z;
- float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
- float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+ float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz));
+ float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz));
float c1lox = (node0.y - P.x) * idir.x;
float c1hix = (node0.w - P.x) * idir.x;
@@ -61,8 +61,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
float c1hiy = (node1.w - P.y) * idir.y;
float c1loz = (node2.y - P.z) * idir.z;
float c1hiz = (node2.w - P.z) * idir.z;
- float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
- float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+ float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz));
+ float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz));
dist[0] = c0min;
dist[1] = c1min;
@@ -101,8 +101,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
float c0hiy = (node1.z - P.y) * idir.y;
float c0loz = (node2.x - P.z) * idir.z;
float c0hiz = (node2.z - P.z) * idir.z;
- float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
- float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+ float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz));
+ float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz));
float c1lox = (node0.y - P.x) * idir.x;
float c1hix = (node0.w - P.x) * idir.x;
@@ -110,8 +110,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
float c1hiy = (node1.w - P.y) * idir.y;
float c1loz = (node2.y - P.z) * idir.z;
float c1hiz = (node2.w - P.z) * idir.z;
- float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
- float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+ float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz));
+ float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz));
if(difl != 0.0f) {
float hdiff = 1.0f + difl;
@@ -483,8 +483,8 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
ssef tfar_y = max(lower_y, upper_y);
ssef tfar_z = max(lower_z, upper_z);
- const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
- const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+ const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+ const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
sseb vmask = tnear <= tfar;
dist[0] = tnear.f[0];
dist[1] = tnear.f[1];
@@ -545,8 +545,8 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg
ssef tfar_y = max(lower_y, upper_y);
ssef tfar_z = max(lower_z, upper_z);
- const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
- const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+ const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+ const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
sseb vmask;
if(difl != 0.0f) {
const float round_down = 1.0f - difl;
diff --git a/intern/cycles/kernel/bvh/qbvh_nodes.h b/intern/cycles/kernel/bvh/qbvh_nodes.h
index 6d22f0b0d6a..3036efd4198 100644
--- a/intern/cycles/kernel/bvh/qbvh_nodes.h
+++ b/intern/cycles/kernel/bvh/qbvh_nodes.h
@@ -126,8 +126,8 @@ ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg
const sseb vmask = cast(tnear) > cast(tfar);
int mask = (int)movemask(vmask)^0xf;
#else
- const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
- const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+ const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+ const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
const sseb vmask = tnear <= tfar;
int mask = (int)movemask(vmask);
#endif
@@ -174,8 +174,8 @@ ccl_device_inline int qbvh_aligned_node_intersect_robust(
const float round_down = 1.0f - difl;
const float round_up = 1.0f + difl;
- const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
- const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+ const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+ const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
const sseb vmask = round_down*tnear <= round_up*tfar;
*dist = tnear;
return (int)movemask(vmask);
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index 6d1cf055f2c..1e2af9de8b3 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -38,11 +38,15 @@
/* Qualifier wrappers for different names on different devices */
#define ccl_device __device__ __inline__
+#if __CUDA_ARCH__ < 300
+# define ccl_device_inline __device__ __inline__
# define ccl_device_forceinline __device__ __forceinline__
-#if __CUDA_ARCH__ < 500
+#elif __CUDA_ARCH__ < 500
# define ccl_device_inline __device__ __forceinline__
+# define ccl_device_forceinline __device__ __forceinline__
#else
# define ccl_device_inline __device__ __inline__
+# define ccl_device_forceinline __device__ __forceinline__
#endif
#define ccl_device_noinline __device__ __noinline__
#define ccl_global