Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBrecht Van Lommel <brechtvanlommel@gmail.com>2016-10-02 15:48:39 +0300
committerBrecht Van Lommel <brechtvanlommel@gmail.com>2016-10-03 23:15:25 +0300
commita3abb020e37a072eb71fd30de9ab125d1c16623a (patch)
treeb525be7f8a0792eedecb2b95802ede88dc3f330e /intern/cycles/kernel/kernel_random.h
parent49ad4215baf16d850d0e367f003ab688e4a3d08e (diff)
Fix Cycles CUDA performance on CUDA 8.0.
Mostly this is making inlining match CUDA 7.5 in a few performance critical places. The end result is that performance is now better than before, possibly due to less register spilling or other CUDA 8.0 compiler improvements. On benchmarks scenes, there are 3% to 35% render time reductions. Stack memory usage is reduced a little too. Reviewed By: sergey Differential Revision: https://developer.blender.org/D2269
Diffstat (limited to 'intern/cycles/kernel/kernel_random.h')
-rw-r--r--intern/cycles/kernel/kernel_random.h12
1 files changed, 3 insertions, 9 deletions
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index b534d9950c5..4a76ffddbe7 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -98,7 +98,7 @@ ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, cons
return index;
}
-ccl_device_inline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension)
+ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension)
{
#ifdef __CMJ__
if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
@@ -132,13 +132,7 @@ ccl_device_inline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng,
#endif
}
-/* Temporary workaround for Pascal cards, otherwise AA does not work properly. */
-#if defined(__KERNEL_GPU__) && __CUDA_ARCH__ >= 600
-__device__ __forceinline__
-#else
-ccl_device_inline
-#endif
-void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
+ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
{
#ifdef __CMJ__
if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
@@ -199,7 +193,7 @@ ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG
/* Linear Congruential Generator */
-ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension)
+ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension)
{
/* implicit mod 2^32 */
rng = (1103515245*(rng) + 12345);