Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBrecht Van Lommel <brechtvanlommel@gmail.com>2016-10-02 15:48:39 +0300
committerBrecht Van Lommel <brechtvanlommel@gmail.com>2016-10-03 23:15:25 +0300
commita3abb020e37a072eb71fd30de9ab125d1c16623a (patch)
treeb525be7f8a0792eedecb2b95802ede88dc3f330e /intern/cycles/kernel/bvh/bvh_nodes.h
parent49ad4215baf16d850d0e367f003ab688e4a3d08e (diff)
Fix Cycles CUDA performance on CUDA 8.0.
Mostly this is making inlining match CUDA 7.5 in a few performance critical places. The end result is that performance is now better than before, possibly due to less register spilling or other CUDA 8.0 compiler improvements. On benchmarks scenes, there are 3% to 35% render time reductions. Stack memory usage is reduced a little too. Reviewed By: sergey Differential Revision: https://developer.blender.org/D2269
Diffstat (limited to 'intern/cycles/kernel/bvh/bvh_nodes.h')
-rw-r--r--intern/cycles/kernel/bvh/bvh_nodes.h30
1 files changed, 15 insertions, 15 deletions
diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h
index db2275b0ff8..726bef1794c 100644
--- a/intern/cycles/kernel/bvh/bvh_nodes.h
+++ b/intern/cycles/kernel/bvh/bvh_nodes.h
@@ -16,7 +16,7 @@
// TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and
// 3-vector which might be faster.
-ccl_device_inline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
+ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
int node_addr,
int child)
{
@@ -30,7 +30,7 @@ ccl_device_inline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
}
#if !defined(__KERNEL_SSE2__)
-ccl_device_inline int bvh_aligned_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
const float3 P,
const float3 idir,
const float t,
@@ -77,7 +77,7 @@ ccl_device_inline int bvh_aligned_node_intersect(KernelGlobals *kg,
#endif
}
-ccl_device_inline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
+ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
const float3 P,
const float3 idir,
const float t,
@@ -139,7 +139,7 @@ ccl_device_inline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
#endif
}
-ccl_device_inline bool bvh_unaligned_node_intersect_child(
+ccl_device_forceinline bool bvh_unaligned_node_intersect_child(
KernelGlobals *kg,
const float3 P,
const float3 dir,
@@ -166,7 +166,7 @@ ccl_device_inline bool bvh_unaligned_node_intersect_child(
return tnear <= tfar;
}
-ccl_device_inline bool bvh_unaligned_node_intersect_child_robust(
+ccl_device_forceinline bool bvh_unaligned_node_intersect_child_robust(
KernelGlobals *kg,
const float3 P,
const float3 dir,
@@ -202,7 +202,7 @@ ccl_device_inline bool bvh_unaligned_node_intersect_child_robust(
}
}
-ccl_device_inline int bvh_unaligned_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
const float3 P,
const float3 dir,
const float3 idir,
@@ -232,7 +232,7 @@ ccl_device_inline int bvh_unaligned_node_intersect(KernelGlobals *kg,
return mask;
}
-ccl_device_inline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
+ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
const float3 P,
const float3 dir,
const float3 idir,
@@ -264,7 +264,7 @@ ccl_device_inline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
return mask;
}
-ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
const float3 P,
const float3 dir,
const float3 idir,
@@ -295,7 +295,7 @@ ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
}
}
-ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
+ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
const float3 P,
const float3 dir,
const float3 idir,
@@ -333,7 +333,7 @@ ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
}
#else /* !defined(__KERNEL_SSE2__) */
-int ccl_device_inline bvh_aligned_node_intersect(
+int ccl_device_forceinline bvh_aligned_node_intersect(
KernelGlobals *kg,
const float3& P,
const float3& dir,
@@ -377,7 +377,7 @@ int ccl_device_inline bvh_aligned_node_intersect(
# endif
}
-int ccl_device_inline bvh_aligned_node_intersect_robust(
+ccl_device_forceinline int bvh_aligned_node_intersect_robust(
KernelGlobals *kg,
const float3& P,
const float3& dir,
@@ -441,7 +441,7 @@ int ccl_device_inline bvh_aligned_node_intersect_robust(
# endif
}
-int ccl_device_inline bvh_unaligned_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
const float3 P,
const float3 dir,
const ssef& isect_near,
@@ -502,7 +502,7 @@ int ccl_device_inline bvh_unaligned_node_intersect(KernelGlobals *kg,
# endif
}
-int ccl_device_inline bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
+ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
const float3 P,
const float3 dir,
const ssef& isect_near,
@@ -573,7 +573,7 @@ int ccl_device_inline bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
# endif
}
-ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
const float3& P,
const float3& dir,
const ssef& isect_near,
@@ -611,7 +611,7 @@ ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
}
}
-ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
+ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
const float3& P,
const float3& dir,
const ssef& isect_near,