1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
|
/**
* Shader that down-samples the depth buffer, creating a Hierarchical-Z buffer.
* Saves max value of each 2x2 texel in the mipmap above the one we are
* rendering to. Adapted from
* http://rastergrid.com/blog/2010/10/hierarchical-z-map-based-occlusion-culling/
*
* A major simplification has been made since we pad the buffer to always be
* bigger than the input to avoid mipmapping misalignment.
*
* Start by copying the base level by quad loading the depth.
* Then each thread computes its local depth for level 1.
* After that we use shared variables to do inter thread communication and
* downsample to max level.
*/
#pragma BLENDER_REQUIRE(common_math_lib.glsl)
/* Workgroup-shared scratch tile holding one depth value per invocation
 * (indexed [y][x]). Written through store_local_depth() and read back
 * through load_local_depths(). */
shared float local_depths[gl_WorkGroupSize.y][gl_WorkGroupSize.x];
/* Fetch the 2x2 quad of the previous lod level from the shared tile.
 * `pixel` is the coordinate in the current level; the quad starts at `pixel * 2`.
 * Components are returned as x = (0,1), y = (1,1), z = (1,0), w = (0,0) offsets. */
vec4 load_local_depths(ivec2 pixel)
{
  ivec2 quad_origin = pixel * 2;
  int row_lo = quad_origin.y;
  int row_hi = quad_origin.y + 1;
  int col_lo = quad_origin.x;
  int col_hi = quad_origin.x + 1;

  vec4 quad;
  quad.x = local_depths[row_hi][col_lo];
  quad.y = local_depths[row_hi][col_hi];
  quad.z = local_depths[row_lo][col_hi];
  quad.w = local_depths[row_lo][col_lo];
  return quad;
}
/* Record `depth` in the shared tile at the texel addressed by `pixel` (x = column, y = row). */
void store_local_depth(ivec2 pixel, float depth)
{
  int row = pixel.y;
  int column = pixel.x;
  local_depths[row][column] = depth;
}
/**
 * Shader entry point.
 *
 * Each invocation gathers a 2x2 quad from `depth_tx`, optionally copies it into
 * `out_mip_0`, writes the quad's `max_v4()` result into `out_mip_1`, then cooperates
 * with the rest of the workgroup through `local_depths` to fill `out_mip_2` to
 * `out_mip_5`. Invocations that pass the `finished_tile_counter` test afterwards
 * re-read `out_mip_5` and produce `out_mip_6` and `out_mip_7`.
 *
 * `depth_tx`, `update_mip_0`, `out_mip_*`, `finished_tile_counter`, `max_v4()` and
 * `divide_ceil()` are declared outside this file section.
 */
void main()
{
/* Position of this invocation inside its workgroup. */
ivec2 local_px = ivec2(gl_LocalInvocationID.xy);
/* Bottom left corner of the kernel. */
ivec2 kernel_origin = ivec2(gl_WorkGroupSize.xy * gl_WorkGroupID.xy);
/* Copy level 0. */
/* Each invocation covers a 2x2 quad of the source, hence the `* 2`. */
ivec2 src_px = ivec2(kernel_origin + local_px) * 2;
vec2 samp_co = (vec2(src_px) + 0.5) / vec2(textureSize(depth_tx, 0));
vec4 samp = textureGather(depth_tx, samp_co);
if (update_mip_0) {
/* The gathered components x, y, z, w are written at offsets (0,1), (1,1), (1,0), (0,0). */
imageStore(out_mip_0, src_px + ivec2(0, 1), samp.xxxx);
imageStore(out_mip_0, src_px + ivec2(1, 1), samp.yyyy);
imageStore(out_mip_0, src_px + ivec2(1, 0), samp.zzzz);
imageStore(out_mip_0, src_px + ivec2(0, 0), samp.wwww);
}
/* Level 1. (No load) */
float max_depth = max_v4(samp);
ivec2 dst_px = ivec2(kernel_origin + local_px);
imageStore(out_mip_1, dst_px, vec4(max_depth));
/* Publish the level 1 value so the next level can read it from the shared tile. */
store_local_depth(local_px, max_depth);
/* Level 2-5. */
bool active_thread;
int mask_shift = 1;
/* downsample_level(out_mip__, lod_): produce one more lod level from the shared tile.
 * - Only invocations whose `local_px` lies inside the workgroup size shifted right by
 *   `mask_shift` are active; `mask_shift` is incremented at the end of every expansion,
 *   so the active region halves per level.
 * - Active invocations reduce a 2x2 quad of the previous level, write it to `out_mip__`
 *   and store it back into the shared tile for the following level.
 * - The two barrier() calls separate the reads of the previous level from the writes
 *   of the new one.
 * - The macro reads and writes the `active_thread`, `mask_shift`, `max_depth`, `dst_px`,
 *   `kernel_origin` and `local_px` variables of the enclosing scope.
 * - The `lod_` argument is not referenced by the macro body. */
#define downsample_level(out_mip__, lod_) \
active_thread = all(lessThan(local_px, gl_WorkGroupSize.xy >> uint(mask_shift))); \
barrier(); /* Wait for previous writes to finish. */ \
if (active_thread) { \
max_depth = max_v4(load_local_depths(local_px)); \
dst_px = ivec2((kernel_origin >> mask_shift) + local_px); \
imageStore(out_mip__, dst_px, vec4(max_depth)); \
} \
barrier(); /* Wait for previous reads to finish. */ \
if (active_thread) { \
store_local_depth(local_px, max_depth); \
} \
mask_shift++;
downsample_level(out_mip_2, 2);
downsample_level(out_mip_3, 3);
downsample_level(out_mip_4, 4);
downsample_level(out_mip_5, 5);
/* Since we pad the destination texture, the mip size is equal to the dispatch size. */
uint tile_count = uint(imageSize(out_mip_5).x * imageSize(out_mip_5).y);
/* Let the last tile handle the remaining LOD. */
/* NOTE(review): `last_tile` is true while the incremented counter is still below
 * `tile_count`, so the invocations that return below are the ones that see the counter
 * reach `tile_count`. This looks inverted with respect to the variable name and the
 * comment above - confirm the intended condition.
 * NOTE(review): the atomicAdd is executed by every invocation, not once per workgroup -
 * confirm that this matches the unit `tile_count` is expressed in. */
bool last_tile = atomicAdd(finished_tile_counter, 1u) + 1u < tile_count;
if (last_tile == false) {
return;
}
/* Reset the counter; presumably so that the next dispatch starts from zero - TODO confirm. */
finished_tile_counter = 0u;
/* Number of workgroup-sized passes needed to cover `out_mip_5`, each invocation reading a 2x2 quad. */
ivec2 iter = divide_ceil(imageSize(out_mip_5), ivec2(gl_WorkGroupSize * 2u));
/* Last valid texel of `out_mip_5`, used to clamp the loads below. */
ivec2 image_border = imageSize(out_mip_5) - 1;
for (int y = 0; y < iter.y; y++) {
for (int x = 0; x < iter.x; x++) {
/* Load result of the other work groups. */
/* NOTE(review): these loads read texels written by other workgroups with imageStore;
 * no memory barrier is visible in this section - confirm visibility is guaranteed. */
kernel_origin = ivec2(gl_WorkGroupSize) * ivec2(x, y);
src_px = ivec2(kernel_origin + local_px) * 2;
/* `samp`, `max_depth` and `dst_px` declared in this loop body shadow the outer variables
 * of the same name; downsample_level below operates on these inner ones. */
vec4 samp;
samp.x = imageLoad(out_mip_5, min(src_px + ivec2(0, 1), image_border)).x;
samp.y = imageLoad(out_mip_5, min(src_px + ivec2(1, 1), image_border)).x;
samp.z = imageLoad(out_mip_5, min(src_px + ivec2(1, 0), image_border)).x;
samp.w = imageLoad(out_mip_5, min(src_px + ivec2(0, 0), image_border)).x;
/* Level 6. */
float max_depth = max_v4(samp);
ivec2 dst_px = ivec2(kernel_origin + local_px);
imageStore(out_mip_6, dst_px, vec4(max_depth));
store_local_depth(local_px, max_depth);
/* Restart the active-region shrinking for this pass. */
mask_shift = 1;
/* Level 7. */
/* NOTE(review): downsample_level calls barrier(); after the early return above, confirm
 * that every invocation of the workgroup still reaches it uniformly. */
downsample_level(out_mip_7, 7);
/* Limited by OpenGL maximum of 8 image slot. */
// downsample_level(out_mip_8, 8);
// downsample_level(out_mip_9, 9);
// downsample_level(out_mip_10, 10);
}
}
}
|