From 47da8dcbcad4ccc5349bc303394e1d01d1c822c5 Mon Sep 17 00:00:00 2001 From: Stefan Werner Date: Thu, 14 Mar 2019 11:45:58 +0100 Subject: Cycles: Improved thread order for better CUDA performance. This patch puts threads that render the same pixel closer together, as opposed to threads that render the same sample. Thus threads within a warp are more coherent in memory access and control flow, leading to performance improvements. Example benchmarks on a Quadro RTX4000 (WDDM) on Windows 10: Koro: 4:23 -> 3:46 BMW: 1:18 -> 1:25 Barbershop Interior: 17:52 -> 14:55 Classroom: 4:37 -> 3:45 Performance differences on OpenCL/AMD were hit and miss: some scenes became faster, others became significantly slower. Therefore, this is kept as a CUDA-only change for now. --- intern/cycles/kernel/kernel_work_stealing.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'intern/cycles/kernel') diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h index 0c2d9379b63..9667156eaf5 100644 --- a/intern/cycles/kernel/kernel_work_stealing.h +++ b/intern/cycles/kernel/kernel_work_stealing.h @@ -66,9 +66,15 @@ ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile, ccl_private uint *y, ccl_private uint *sample) { +#ifdef __KERNEL_CUDA__ + /* Keeping threads for the same pixel together improves performance on CUDA. */ + uint sample_offset = global_work_index % tile->num_samples; + uint pixel_offset = global_work_index / tile->num_samples; +#else /* __KERNEL_CUDA__ */ uint tile_pixels = tile->w * tile->h; uint sample_offset = global_work_index / tile_pixels; uint pixel_offset = global_work_index - sample_offset * tile_pixels; +#endif /* __KERNEL_CUDA__ */ uint y_offset = pixel_offset / tile->w; uint x_offset = pixel_offset - y_offset * tile->w; -- cgit v1.2.3