diff options
author | Sergey Sharybin <sergey.vfx@gmail.com> | 2015-03-13 10:14:43 +0300 |
---|---|---|
committer | Sergey Sharybin <sergey.vfx@gmail.com> | 2015-03-13 10:38:14 +0300 |
commit | 61eab743f1377fdfcf44f2e4928290a3fc4ccfea (patch) | |
tree | 6dff417678cc61e7096c03f2a6c05dd5e33d42c5 | |
parent | aa4cb95a5c8569704f166cfd6d8f65606502ea40 (diff) |
Cycles: Optimization for CMJ in CUDA kernels
Two things:
- Use intrinsics for clz/ctz (ctz is implemented via ffs()).
- Use a faster sqrt() function whose precision is sufficient for
integer values.
-rw-r--r-- | intern/cycles/kernel/kernel_jitter.h | 8 |
1 files changed, 8 insertions, 0 deletions
diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h index 6aa29311ee6..6953f005ea9 100644 --- a/intern/cycles/kernel/kernel_jitter.h +++ b/intern/cycles/kernel/kernel_jitter.h @@ -47,6 +47,8 @@ ccl_device_inline int cmj_fast_div_pow2(int a, int b) # else return a >> __builtin_ctz(b); # endif +#elif defined(__KERNEL_CUDA__) + return a >> (__ffs(b) - 1); #else return a/b; #endif @@ -63,6 +65,8 @@ ccl_device_inline uint cmj_w_mask(uint w) # else return ((1 << (32 - __builtin_clz(w))) - 1); # endif +#elif defined(__KERNEL_CUDA__) + return ((1 << (32 - __clz(w))) - 1); #else w |= w >> 1; w |= w >> 2; @@ -167,7 +171,11 @@ ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy) { kernel_assert(s < N); +#if defined(__KERNEL_CUDA__) + int m = float_to_int(__fsqrt_ru(N)); +#else int m = float_to_int(sqrtf(N)); +#endif int n = (N + m - 1)/m; float invN = 1.0f/N; float invm = 1.0f/m; |