diff options
author | Brecht Van Lommel <brechtvanlommel@gmail.com> | 2017-09-27 02:38:19 +0300 |
---|---|---|
committer | Brecht Van Lommel <brechtvanlommel@gmail.com> | 2017-10-04 22:58:47 +0300 |
commit | 6da6f8d33f65b427162d0c8b13a5a5f5043bc8a5 (patch) | |
tree | be10ed89869fd67aa7d430897301f6ecbb2d577a /intern/cycles/kernel/kernel_passes.h | |
parent | 77f300e2a9289af026278171b51103bf485297e4 (diff) |
Cycles: CUDA faster rendering of small tiles, using multiple samples like OpenCL.

The work size is still very conservative, and this doesn't help for progressive
refine. For that we will need to render multiple tiles at the same time. But this
should already help for denoising renders that require too much memory with big
tiles, and just generally soften the performance dropoff with small tiles.

Differential Revision: https://developer.blender.org/D2856
Diffstat (limited to 'intern/cycles/kernel/kernel_passes.h')
-rw-r--r-- | intern/cycles/kernel/kernel_passes.h | 18 |
1 files changed, 11 insertions, 7 deletions
```diff
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 239598f7dab..644cc173571 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -16,19 +16,23 @@
 
 CCL_NAMESPACE_BEGIN
 
+#if defined(__SPLIT_KERNEL__) || defined(__KERNEL_CUDA__)
+#define __ATOMIC_PASS_WRITE__
+#endif
+
 ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, float value)
 {
 	ccl_global float *buf = buffer;
-#if defined(__SPLIT_KERNEL__)
+#ifdef __ATOMIC_PASS_WRITE__
 	atomic_add_and_fetch_float(buf, value);
 #else
 	*buf += value;
-#endif /* __SPLIT_KERNEL__ */
+#endif
 }
 
 ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, float3 value)
 {
-#if defined(__SPLIT_KERNEL__)
+#ifdef __ATOMIC_PASS_WRITE__
 	ccl_global float *buf_x = buffer + 0;
 	ccl_global float *buf_y = buffer + 1;
 	ccl_global float *buf_z = buffer + 2;
@@ -39,12 +43,12 @@ ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, float3
 #else
 	ccl_global float3 *buf = (ccl_global float3*)buffer;
 	*buf += value;
-#endif /* __SPLIT_KERNEL__ */
+#endif
 }
 
 ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, float4 value)
 {
-#if defined(__SPLIT_KERNEL__)
+#ifdef __ATOMIC_PASS_WRITE__
 	ccl_global float *buf_x = buffer + 0;
 	ccl_global float *buf_y = buffer + 1;
 	ccl_global float *buf_z = buffer + 2;
@@ -57,7 +61,7 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, float4
 #else
 	ccl_global float4 *buf = (ccl_global float4*)buffer;
 	*buf += value;
-#endif /* __SPLIT_KERNEL__ */
+#endif
 }
 
 #ifdef __DENOISING_FEATURES__
@@ -70,7 +74,7 @@ ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer
 	kernel_write_pass_float(buffer+1, value*value);
 }
 
-# if defined(__SPLIT_KERNEL__)
+# ifdef __ATOMIC_PASS_WRITE__
 # define kernel_write_pass_float3_unaligned kernel_write_pass_float3
 # else
 ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, float3 value)
```