/*
 * Copyright 2011-2015 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __KERNEL_WORK_STEALING_H__
#define __KERNEL_WORK_STEALING_H__

CCL_NAMESPACE_BEGIN

/*
 * Utility functions for work stealing
 */

/* Map global work index to tile, pixel X/Y and sample. */
ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
                                      uint global_work_index,
                                      ccl_private uint *x,
                                      ccl_private uint *y,
                                      ccl_private uint *sample)
{
#ifdef __KERNEL_CUDA__
  /* Keeping threads for the same pixel together improves performance on CUDA. */
  uint sample_offset = global_work_index % tile->num_samples;
  uint pixel_offset = global_work_index / tile->num_samples;
#else  /* __KERNEL_CUDA__ */
  uint tile_pixels = tile->w * tile->h;
  uint sample_offset = global_work_index / tile_pixels;
  uint pixel_offset = global_work_index - sample_offset * tile_pixels;
#endif /* __KERNEL_CUDA__ */
  uint y_offset = pixel_offset / tile->w;
  uint x_offset = pixel_offset - y_offset * tile->w;

  *x = tile->x + x_offset;
  *y = tile->y + y_offset;
  *sample = tile->start_sample + sample_offset;
}
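
/* Illustrative example (added for clarity, not part of the original header):
 * for a 4x2 tile (w = 4, h = 2) with num_samples = 8 and
 * global_work_index = 13, the generic path computes tile_pixels = 8,
 * sample_offset = 13 / 8 = 1, pixel_offset = 13 - 1 * 8 = 5,
 * y_offset = 5 / 4 = 1 and x_offset = 5 - 1 * 4 = 1, i.e. pixel (1, 1)
 * inside the tile at sample start_sample + 1. On CUDA the decomposition is
 * transposed: sample_offset = 13 % 8 = 5 and pixel_offset = 13 / 8 = 1, so
 * consecutive work indices map to consecutive samples of the same pixel. */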

#ifdef __KERNEL_OPENCL__
#  pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#endif

#ifdef __SPLIT_KERNEL__
/* Returns true if there is work left for this thread's pool, and writes the
 * global index of the claimed item to *global_work_index. */
ccl_device bool get_next_work_item(KernelGlobals *kg,
                                   ccl_global uint *work_pools,
                                   uint total_work_size,
                                   uint ray_index,
                                   ccl_private uint *global_work_index)
{
  /* With a small amount of work there may be more threads than work items,
   * because the global size is rounded up; stop such threads immediately. */
  if (ray_index >= total_work_size) {
    return false;
  }

  /* Atomically increment the work index counter of this thread's pool. */
  uint pool = ray_index / WORK_POOL_SIZE;
  uint work_index = atomic_fetch_and_inc_uint32(&work_pools[pool]);

  /* Map per-pool work index to a global work index. */
  uint global_size = ccl_global_size(0) * ccl_global_size(1);
  kernel_assert(global_size % WORK_POOL_SIZE == 0);
  kernel_assert(ray_index < global_size);

  *global_work_index = (work_index / WORK_POOL_SIZE) * global_size + (pool * WORK_POOL_SIZE) +
                       (work_index % WORK_POOL_SIZE);
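
  /* Illustrative example (values chosen for this comment only): with
   * WORK_POOL_SIZE = 64 and global_size = 256, pool 2 hands out
   * work_index 130 as
   *   (130 / 64) * 256 + 2 * 64 + (130 % 64) = 512 + 128 + 2 = 642,
   * so each pool owns the same 64-wide slice of every 256-item stripe of the
   * work range, and threads sharing a pool claim adjacent work items. */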

  /* Test whether the mapped index is still inside the work range; if not,
   * all work for this pool is done. */
  return (*global_work_index < total_work_size);
}

ccl_device bool get_next_work(KernelGlobals *kg,
                              ccl_global uint *work_pools,
                              uint total_work_size,
                              uint ray_index,
                              ccl_private uint *global_work_index)
{
  bool got_work = false;
  if (kernel_data.film.pass_adaptive_aux_buffer) {
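    /* Adaptive sampling: keep claiming work items and skip those whose pixel
     * already has a non-zero .w in the adaptive aux pass (written when the
     * pixel is stopped), until an active pixel is found or work runs out. */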
    do {
      got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index);
      if (got_work) {
        ccl_global WorkTile *tile = &kernel_split_params.tile;
        uint x, y, sample;
        get_work_pixel(tile, *global_work_index, &x, &y, &sample);
        uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride;
        ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
        ccl_global float4 *aux = (ccl_global float4 *)(buffer +
                                                       kernel_data.film.pass_adaptive_aux_buffer);
        if ((*aux).w == 0.0f) {
          break;
        }
      }
    } while (got_work);
  }
  else {
    got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index);
  }
  return got_work;
}
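
/* Typical split kernel usage (sketch only; the surrounding variable names are
 * assumptions and do not come from this header):
 *
 *   uint work_index;
 *   if (get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) {
 *     uint x, y, sample;
 *     get_work_pixel(&kernel_split_params.tile, work_index, &x, &y, &sample);
 *     // Initialize the path state for pixel (x, y) at this sample.
 *   }
 */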
#endif

CCL_NAMESPACE_END

#endif /* __KERNEL_WORK_STEALING_H__ */