Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Brecht Van Lommel <brecht@blender.org> 2021-09-20 18:59:20 +0300
committer: Brecht Van Lommel <brecht@blender.org> 2021-09-21 15:55:54 +0300
commit: 08031197250aeecbaca3803254e6f25b8c7b7b37 (patch)
tree: 6fe7ab045f0dc0a423d6557c4073f34309ef4740 /intern/cycles/kernel/kernel_work_stealing.h
parent: fa6b1007bad065440950cd67deb16a04f368856f (diff)
Cycles: merge of cycles-x branch, a major update to the renderer
This includes much improved GPU rendering performance, viewport interactivity, new shadow catcher, revamped sampling settings, subsurface scattering anisotropy, new GPU volume sampling, improved PMJ sampling pattern, and more.

Some features have also been removed or changed, breaking backwards compatibility, including the removal of the OpenCL backend, for which alternatives are under development.

Release notes and code docs:
https://wiki.blender.org/wiki/Reference/Release_Notes/3.0/Cycles
https://wiki.blender.org/wiki/Source/Render/Cycles

Credits:
* Sergey Sharybin
* Brecht Van Lommel
* Patrick Mours (OptiX backend)
* Christophe Hery (subsurface scattering anisotropy)
* William Leeson (PMJ sampling pattern)
* Alaska (various fixes and tweaks)
* Thomas Dinges (various fixes)

For the full commit history, see the cycles-x branch. This squashes together all the changes since intermediate changes would often fail building or tests.

Ref T87839, T87837, T87836
Fixes T90734, T89353, T80267, T80267, T77185, T69800
Diffstat (limited to 'intern/cycles/kernel/kernel_work_stealing.h')
-rw-r--r-- intern/cycles/kernel/kernel_work_stealing.h 87
1 files changed, 11 insertions, 76 deletions
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
index d1602744f1d..fab0915c38e 100644
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_WORK_STEALING_H__
-#define __KERNEL_WORK_STEALING_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -24,21 +23,24 @@ CCL_NAMESPACE_BEGIN
*/
/* Map global work index to tile, pixel X/Y and sample. */
-ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
+ccl_device_inline void get_work_pixel(ccl_global const KernelWorkTile *tile,
uint global_work_index,
ccl_private uint *x,
ccl_private uint *y,
ccl_private uint *sample)
{
-#ifdef __KERNEL_CUDA__
- /* Keeping threads for the same pixel together improves performance on CUDA. */
- uint sample_offset = global_work_index % tile->num_samples;
- uint pixel_offset = global_work_index / tile->num_samples;
-#else /* __KERNEL_CUDA__ */
+#if 0
+ /* Keep threads for the same sample together. */
uint tile_pixels = tile->w * tile->h;
uint sample_offset = global_work_index / tile_pixels;
uint pixel_offset = global_work_index - sample_offset * tile_pixels;
-#endif /* __KERNEL_CUDA__ */
+#else
+ /* Keeping threads for the same pixel together.
+ * Appears to improve performance by a few % on CUDA and OptiX. */
+ uint sample_offset = global_work_index % tile->num_samples;
+ uint pixel_offset = global_work_index / tile->num_samples;
+#endif
+
uint y_offset = pixel_offset / tile->w;
uint x_offset = pixel_offset - y_offset * tile->w;
@@ -47,71 +49,4 @@ ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
*sample = tile->start_sample + sample_offset;
}
-#ifdef __KERNEL_OPENCL__
-# pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-#endif
-
-#ifdef __SPLIT_KERNEL__
-/* Returns true if there is work */
-ccl_device bool get_next_work_item(KernelGlobals *kg,
- ccl_global uint *work_pools,
- uint total_work_size,
- uint ray_index,
- ccl_private uint *global_work_index)
-{
- /* With a small amount of work there may be more threads than work due to
- * rounding up of global size, stop such threads immediately. */
- if (ray_index >= total_work_size) {
- return false;
- }
-
- /* Increase atomic work index counter in pool. */
- uint pool = ray_index / WORK_POOL_SIZE;
- uint work_index = atomic_fetch_and_inc_uint32(&work_pools[pool]);
-
- /* Map per-pool work index to a global work index. */
- uint global_size = ccl_global_size(0) * ccl_global_size(1);
- kernel_assert(global_size % WORK_POOL_SIZE == 0);
- kernel_assert(ray_index < global_size);
-
- *global_work_index = (work_index / WORK_POOL_SIZE) * global_size + (pool * WORK_POOL_SIZE) +
- (work_index % WORK_POOL_SIZE);
-
- /* Test if all work for this pool is done. */
- return (*global_work_index < total_work_size);
-}
-
-ccl_device bool get_next_work(KernelGlobals *kg,
- ccl_global uint *work_pools,
- uint total_work_size,
- uint ray_index,
- ccl_private uint *global_work_index)
-{
- bool got_work = false;
- if (kernel_data.film.pass_adaptive_aux_buffer) {
- do {
- got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index);
- if (got_work) {
- ccl_global WorkTile *tile = &kernel_split_params.tile;
- uint x, y, sample;
- get_work_pixel(tile, *global_work_index, &x, &y, &sample);
- uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride;
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
- ccl_global float4 *aux = (ccl_global float4 *)(buffer +
- kernel_data.film.pass_adaptive_aux_buffer);
- if ((*aux).w == 0.0f) {
- break;
- }
- }
- } while (got_work);
- }
- else {
- got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index);
- }
- return got_work;
-}
-#endif
-
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_WORK_STEALING_H__ */