/* source/blender/draw/engines/eevee_next/shaders/eevee_depth_of_field_stabilize_comp.glsl */


/**
 * Temporal Stabilization of the Depth of field input.
 * Corresponds to the TAA pass in the paper.
 * We actually duplicate the TAA logic but with a few changes:
 * - We run this pass at half resolution.
 * - We store CoC instead of Opacity in the alpha channel of the history.
 *
 * This is an adaptation of the code found in eevee_film_lib.glsl
 *
 * Inputs:
 * - Output of setup pass (halfres).
 * Outputs:
 * - Stabilized Color and CoC (halfres).
 **/

#pragma BLENDER_REQUIRE(common_math_geom_lib.glsl)
#pragma BLENDER_REQUIRE(eevee_colorspace_lib.glsl)
#pragma BLENDER_REQUIRE(eevee_depth_of_field_lib.glsl)
#pragma BLENDER_REQUIRE(eevee_velocity_lib.glsl)

/* One depth of field sample: a color together with its Circle of Confusion (CoC). */
struct DofSample {
  /* Sample color. Samples fetched from the LDS cache are stored in YCoCg space. */
  vec4 color;
  /* Circle of Confusion associated with this sample. */
  float coc;
};

/* -------------------------------------------------------------------- */
/** \name LDS Cache
 * \{ */

/* Side length of the color / CoC tile: the work-group width plus a 1 texel border on each side.
 * NOTE: Only `gl_WorkGroupSize.x` is used for both dimensions, which assumes a square
 * work-group. */
const uint cache_size = gl_WorkGroupSize.x + 2;
/* Input color converted to YCoCg. Indexed as [y][x]. */
shared vec4 color_cache[cache_size][cache_size];
/* Input CoC. Indexed as [y][x]. */
shared float coc_cache[cache_size][cache_size];
/* Need 2 pixel border for depth. */
const uint cache_depth_size = gl_WorkGroupSize.x + 4;
/* Depth fetched every 2 pixels of the full resolution depth buffer. Indexed as [y][x]. */
shared float depth_cache[cache_depth_size][cache_depth_size];

void dof_cache_init()
{
  /**
   * Fill the shared memory tiles with every value needed by the filters of this work-group.
   *
   * ┌──────────────────────────────┐
   * │                              │  < Border texels that needs to be loaded.
   * │    x  x  x  x  x  x  x  x    │  ─┐
   * │    x  x  x  x  x  x  x  x    │   │
   * │    x  x  x  x  x  x  x  x    │   │
   * │    x  x  x  x  x  x  x  x    │   │ Thread Group Size 8x8.
   * │ L  L  L  L  L  x  x  x  x    │   │
   * │ L  L  L  L  L  x  x  x  x    │   │
   * │ L  L  L  L  L  x  x  x  x    │   │
   * │ L  L  L  L  L  x  x  x  x    │  ─┘
   * │ L  L  L  L  L                │  < Border texels that needs to be loaded.
   * └──────────────────────────────┘
   *   └───────────┘
   *    Load using 5x5 threads.
   *
   * Each loading thread fills one texel in each of the 4 quadrants of a tile.
   */
  ivec2 global_texel = ivec2(gl_GlobalInvocationID.xy);
  ivec2 local_texel = ivec2(gl_LocalInvocationID.xy);

  /* Quadrant extents. Only the threads inside a quadrant take part in the loading. */
  ivec2 color_quadrant_size = ivec2(cache_size / 2u);
  ivec2 depth_quadrant_size = ivec2(cache_depth_size / 2u);
  bool is_color_loader = all(lessThan(gl_LocalInvocationID.xy, uvec2(cache_size / 2u)));
  bool is_depth_loader = all(lessThan(gl_LocalInvocationID.xy, uvec2(cache_depth_size / 2u)));

  /* Last valid texel of each source texture. Used to clamp the border loads. */
  ivec2 color_texel_max = textureSize(color_tx, 0) - 1;
  ivec2 depth_texel_max = textureSize(depth_tx, 0) - 1;

  for (int quadrant = 0; quadrant < 4; quadrant++) {
    /* Same traversal as a nested y / x loop over a 2x2 grid. */
    ivec2 quadrant_id = ivec2(quadrant & 1, quadrant >> 1);

    /* 1 Pixel border. */
    if (is_color_loader) {
      ivec2 tile_offset = quadrant_id * color_quadrant_size;
      ivec2 tile_texel = local_texel + tile_offset;
      ivec2 source_texel = clamp(global_texel + tile_offset - 1, ivec2(0), color_texel_max);

      vec4 scene_color = texelFetch(color_tx, source_texel, 0);
      color_cache[tile_texel.y][tile_texel.x] = colorspace_YCoCg_from_scene_linear(scene_color);
      coc_cache[tile_texel.y][tile_texel.x] = texelFetch(coc_tx, source_texel, 0).x;
    }

    /* 2 Pixels border. */
    if (is_depth_loader) {
      ivec2 tile_offset = quadrant_id * depth_quadrant_size;
      ivec2 tile_texel = local_texel + tile_offset;
      /* Depth is fullres. Load every 2 pixels. */
      ivec2 source_texel = clamp((global_texel + tile_offset - 2) * 2, ivec2(0), depth_texel_max);

      depth_cache[tile_texel.y][tile_texel.x] = texelFetch(depth_tx, source_texel, 0).x;
    }
  }
  /* Every thread of the work-group must reach this point before the tiles are read. */
  barrier();
}

/* Read the cached color and CoC at \a offset (in texels) from the current thread's texel.
 * NOTE: Sample color space is already in YCoCg space. */
DofSample dof_fetch_input_sample(ivec2 offset)
{
  /* Shift by 1 to skip the 1 texel border of the color / CoC tiles. */
  ivec2 tile_texel = ivec2(gl_LocalInvocationID.xy) + offset + 1;

  DofSample samp;
  samp.color = color_cache[tile_texel.y][tile_texel.x];
  samp.coc = coc_cache[tile_texel.y][tile_texel.x];
  return samp;
}

/* Read the cached depth at \a offset (in half resolution texels) from the current thread's
 * texel. */
float dof_fetch_half_depth(ivec2 offset)
{
  /* Shift by 2 to skip the 2 texel border of the depth tile. */
  ivec2 tile_texel = ivec2(gl_LocalInvocationID.xy) + offset + 2;
  return depth_cache[tile_texel.y][tile_texel.x];
}

/** \} */

/* Weight of a sample given its \a luma. The weight decreases as the luma increases. */
float dof_luma_weight(float luma)
{
  /* Slide 20 of "High Quality Temporal Supersampling" by Brian Karis at Siggraph 2014. */
  const float exposure_scale = 1.0; /* TODO. */
  /* To preserve more details in dark areas, we use a bigger bias. */
  const float bias = 4.0;
  float denominator = bias + luma * exposure_scale;
  return 1.0 / denominator;
}

/* Weight of a neighbor sample given its CoC and the CoC of the reference (center) sample. */
float dof_bilateral_weight(float reference_coc, float sample_coc)
{
  /* NOTE: The CoC difference is signed on purpose. It should be inside an abs() function,
   * but we follow UE4 implementation to improve how dithered transparency looks (see slide 19).
   * Effectively bleed background into foreground.
   * Compared to dof_bilateral_coc_weights() this saturates at 2x the reference CoC. */
  float coc_delta = sample_coc - reference_coc;
  float coc_scale = max(1.0, abs(reference_coc));
  return saturate(1.0 - coc_delta / coc_scale);
}

/* Filter the center sample with its 4 direct neighbors.
 * Returns the weighted average of color and CoC. */
DofSample dof_spatial_filtering()
{
  /* Plus (+) shape offsets. */
  const ivec2 neighbor_offsets[4] = ivec2[4](ivec2(-1, 0), ivec2(0, -1), ivec2(1, 0), ivec2(0, 1));

  DofSample center = dof_fetch_input_sample(ivec2(0));

  vec4 color_sum = vec4(0.0);
  float coc_sum = 0.0;
  float weight_sum = 0.0;

  for (int i = 0; i < 4; i++) {
    DofSample neighbor = dof_fetch_input_sample(neighbor_offsets[i]);
    float neighbor_weight = dof_buf.filter_samples_weight[i] * dof_luma_weight(neighbor.color.x) *
                            dof_bilateral_weight(center.coc, neighbor.coc);

    color_sum += neighbor.color * neighbor_weight;
    coc_sum += neighbor.coc * neighbor_weight;
    weight_sum += neighbor_weight;
  }

  /* Accumulate center sample last as it does not need bilateral_weights. */
  float center_weight = dof_buf.filter_center_weight * dof_luma_weight(center.color.x);
  color_sum += center.color * center_weight;
  coc_sum += center.coc * center_weight;
  weight_sum += center_weight;

  /* Normalize by the total weight. */
  float inv_weight_sum = 1.0 / weight_sum;
  return DofSample(color_sum * inv_weight_sum, coc_sum * inv_weight_sum);
}

/* Per component bounds (color and CoC) of the samples surrounding a texel. */
struct DofNeighborhoodMinMax {
  /* Lower bound of color and CoC. */
  DofSample min;
  /* Upper bound of color and CoC. */
  DofSample max;
};

/* Return history clipping bounding box in YCoCg color space.
 * The CoC bounds are computed the same way as the color bounds. */
DofNeighborhoodMinMax dof_neighbor_boundbox()
{
  /* Plus (+) shape offsets. */
  const ivec2 plus_offsets[4] = ivec2[4](ivec2(-1, 0), ivec2(0, -1), ivec2(1, 0), ivec2(0, 1));
  /* Diagonal offsets completing the 3x3 neighborhood. */
  const ivec2 corner_offsets[4] = ivec2[4](ivec2(-1, -1), ivec2(1, -1), ivec2(-1, 1), ivec2(1, 1));

  /**
   * Simple bounding box calculation in YCoCg as described in:
   * "High Quality Temporal Supersampling" by Brian Karis at Siggraph 2014
   */
  DofSample center = dof_fetch_input_sample(ivec2(0));

  /* Bounds of the plus shaped neighborhood (center included). */
  vec4 plus_min_color = center.color;
  vec4 plus_max_color = center.color;
  float plus_min_coc = center.coc;
  float plus_max_coc = center.coc;
  for (int i = 0; i < 4; i++) {
    DofSample neighbor = dof_fetch_input_sample(plus_offsets[i]);
    plus_min_color = min(plus_min_color, neighbor.color);
    plus_max_color = max(plus_max_color, neighbor.color);
    plus_min_coc = min(plus_min_coc, neighbor.coc);
    plus_max_coc = max(plus_max_coc, neighbor.coc);
  }

  /* Bounds of the full 3x3 neighborhood: start from the plus bounds and add the corners. */
  vec4 box_min_color = plus_min_color;
  vec4 box_max_color = plus_max_color;
  float box_min_coc = plus_min_coc;
  float box_max_coc = plus_max_coc;
  for (int i = 0; i < 4; i++) {
    DofSample neighbor = dof_fetch_input_sample(corner_offsets[i]);
    box_min_color = min(box_min_color, neighbor.color);
    box_max_color = max(box_max_color, neighbor.color);
    box_min_coc = min(box_min_coc, neighbor.coc);
    box_max_coc = max(box_max_coc, neighbor.coc);
  }

  /* (Slide 32) Simple clamp to min/max of 8 neighbors results in 3x3 box artifacts.
   * Round bbox shape by averaging 2 different min/max from 2 different neighborhood. */
  DofSample bbox_min = DofSample((plus_min_color + box_min_color) * 0.5,
                                 (plus_min_coc + box_min_coc) * 0.5);
  DofSample bbox_max = DofSample((plus_max_color + box_max_color) * 0.5,
                                 (plus_max_coc + box_max_coc) * 0.5);

  return DofNeighborhoodMinMax(bbox_min, bbox_max);
}

/* Returns motion in pixel space to retrieve the pixel history.
 * \a texel_sample is the half resolution texel being processed. */
vec2 dof_pixel_history_motion_vector(ivec2 texel_sample)
{
  /**
   * Dilate velocity by using the nearest pixel in a cross pattern.
   * "High Quality Temporal Supersampling" by Brian Karis at Siggraph 2014 (Slide 27)
   */
  const ivec2 dilate_offsets[4] = ivec2[4](ivec2(-2, -2), ivec2(2, -2), ivec2(-2, 2), ivec2(2, 2));

  /* Start with the center texel and keep the offset with the smallest depth value. */
  ivec2 nearest_offset = ivec2(0);
  float nearest_depth = dof_fetch_half_depth(nearest_offset);
  for (int i = 0; i < 4; i++) {
    float candidate_depth = dof_fetch_half_depth(dilate_offsets[i]);
    if (candidate_depth < nearest_depth) {
      nearest_depth = candidate_depth;
      nearest_offset = dilate_offsets[i];
    }
  }

  /* Convert to full resolution buffer pixel and keep it inside the velocity texture. */
  ivec2 velocity_texel_max = textureSize(velocity_tx, 0).xy - 1;
  ivec2 velocity_texel = clamp((texel_sample + nearest_offset) * 2, ivec2(0), velocity_texel_max);

  vec4 resolved_velocity = velocity_resolve(velocity_tx, velocity_texel, nearest_depth);

  /* Transform to **half** pixel space. */
  vec2 half_res_extent = vec2(textureSize(color_tx, 0));
  return resolved_velocity.xy * half_res_extent;
}

/* Load the history color and CoC using a special filter to avoid losing detail.
 * \a input_texel is the sample position with subpixel accuracy, in history texture texels.
 * Returns the history sample with its CoC read from the alpha channel of the history. */
DofSample dof_sample_history(vec2 input_texel)
{
#if 1 /* Bilinear. */
  /* Convert the integer texture size explicitly instead of relying on an implicit
   * `ivec2` to `vec2` conversion. */
  vec2 uv = vec2(input_texel + 0.5) / vec2(textureSize(in_history_tx, 0));
  vec4 color = textureLod(in_history_tx, uv, 0.0);

#else /* Catmull Rom interpolation. 5 Bilinear Taps. */
  /* NOTE(review): This disabled path uses `film_get_catmull_rom_weights()` and
   * `film_buf.extent_inv`, which come from the film code this function was adapted from.
   * Confirm they are available and match the history texture extent before enabling. */
  vec2 center_texel;
  vec2 inter_texel = modf(input_texel, center_texel);
  vec2 weights[4];
  film_get_catmull_rom_weights(inter_texel, weights);

  /**
   * Use optimized version by leveraging bilinear filtering from hardware sampler and by removing
   * corner taps.
   * From "Filmic SMAA" by Jorge Jimenez at Siggraph 2016
   * http://advances.realtimerendering.com/s2016/Filmic%20SMAA%20v7.pptx
   */
  center_texel += 0.5;

  /* Slide 92. */
  vec2 weight_12 = weights[1] + weights[2];
  vec2 uv_12 = (center_texel + weights[2] / weight_12) * film_buf.extent_inv;
  vec2 uv_0 = (center_texel - 1.0) * film_buf.extent_inv;
  vec2 uv_3 = (center_texel + 2.0) * film_buf.extent_inv;

  vec4 color;
  vec4 weight_cross = weight_12.xyyx * vec4(weights[0].yx, weights[3].xy);
  float weight_center = weight_12.x * weight_12.y;

  color = textureLod(in_history_tx, uv_12, 0.0) * weight_center;
  color += textureLod(in_history_tx, vec2(uv_12.x, uv_0.y), 0.0) * weight_cross.x;
  color += textureLod(in_history_tx, vec2(uv_0.x, uv_12.y), 0.0) * weight_cross.y;
  color += textureLod(in_history_tx, vec2(uv_3.x, uv_12.y), 0.0) * weight_cross.z;
  color += textureLod(in_history_tx, vec2(uv_12.x, uv_3.y), 0.0) * weight_cross.w;
  /* Re-normalize for the removed corners. */
  color /= (weight_center + sum(weight_cross));
#endif
  /* The history stores CoC in its alpha channel, so `color.w` is the CoC.
   * NOTE(fclem): Opacity is wrong on purpose. Final Opacity does not rely on history. */
  return DofSample(color.xyzz, color.w);
}

/* Modulate the history color to avoid ghosting artifact.
 * Returns \a history constrained to the neighborhood bounds \a bbox. */
DofSample dof_amend_history(DofNeighborhoodMinMax bbox, DofSample history, DofSample src)
{
  DofSample amended = history;
#if 0
  /* Clip instead of clamping to avoid color accumulating in the AABB corners. */
  vec3 clip_dir = src.color.rgb - amended.color.rgb;

  float t = line_aabb_clipping_dist(
      amended.color.rgb, clip_dir, bbox.min.color.rgb, bbox.max.color.rgb);
  amended.color.rgb += clip_dir * saturate(t);
#else
  /* More responsive. */
  amended.color = clamp(amended.color, bbox.min.color, bbox.max.color);
#endif
  /* Clamp CoC to reduce convergence time. Otherwise the result is laggy. */
  amended.coc = clamp(amended.coc, bbox.min.coc, bbox.max.coc);

  return amended;
}

/* Compute how much of the incoming sample replaces the history.
 * \a velocity is the length of the motion vector in half resolution pixels.
 * \a texel is the reprojected history position, used to reject out of view history.
 * \a bbox is the neighborhood bounds of the incoming data.
 * \a src is the incoming (filtered) sample and \a dst is the history sample.
 * Returns the blend factor. A value of 1.0 discards the history. */
float dof_history_blend_factor(
    float velocity, vec2 texel, DofNeighborhoodMinMax bbox, DofSample src, DofSample dst)
{
  float luma_min = bbox.min.color.x;
  float luma_max = bbox.max.color.x;
  float luma_history = dst.color.x;

  /* 5% of incoming color by default. */
  float blend = 0.05;
  /* Blend less history if the pixel has substantial velocity. */
  /* NOTE(fclem): velocity threshold multiplied by 2 because of half resolution. */
  blend = mix(blend, 0.20, saturate(velocity * 0.02 * 2.0));
  /**
   * "High Quality Temporal Supersampling" by Brian Karis at Siggraph 2014 (Slide 43)
   * Bias towards history if the history luma is near the clamping bounds. Reduces flicker.
   */
  float distance_to_luma_clip = min_v2(vec2(luma_history - luma_min, luma_max - luma_history));
  /* Divide by bbox size to get a factor. 2 factor to compensate the line above. */
  distance_to_luma_clip *= 2.0 * safe_rcp(luma_max - luma_min);
  /* Linearly blend when history gets below to 25% of the bbox size. */
  blend *= saturate(distance_to_luma_clip * 4.0 + 0.1);
  /* Progressively discard history until history CoC is twice as big as the filtered CoC.
   * Note we use absolute diff here because we are not comparing neighbors and thus do not risk to
   * dilate thin features like hair (slide 19). */
  float coc_diff_ratio = saturate(abs(src.coc - dst.coc) / max(1.0, abs(src.coc)));
  blend = mix(blend, 1.0, coc_diff_ratio);
  /* Discard out of view history. */
  if (any(lessThan(texel, vec2(0))) ||
      any(greaterThanEqual(texel, vec2(imageSize(out_history_img))))) {
    blend = 1.0;
  }
  /* Discard history if invalid. */
  if (use_history == false) {
    blend = 1.0;
  }
  return blend;
}

/* Entry point: stabilize the color and CoC of one half resolution texel using its history. */
void main()
{
  dof_cache_init();

  ivec2 src_texel = ivec2(gl_GlobalInvocationID.xy);

  /**
   * Naming convention is taken from the film implementation.
   * SRC is incoming new data.
   * DST is history data.
   */
  DofSample src = dof_spatial_filtering();

  /* Reproject by finding where this pixel was in the previous frame. */
  vec2 motion = dof_pixel_history_motion_vector(src_texel);
  vec2 history_texel = vec2(src_texel) + motion;
  DofSample dst = dof_sample_history(history_texel);

  /* Get local color bounding box of source neighborhood. */
  DofNeighborhoodMinMax bbox = dof_neighbor_boundbox();

  /* The blend factor is evaluated with the raw history, before it gets clamped to the bounds. */
  float blend = dof_history_blend_factor(length(motion), history_texel, bbox, src, dst);
  dst = dof_amend_history(bbox, dst, src);

  /* Luma weighted blend to reduce flickering. */
  float weight_dst = dof_luma_weight(dst.color.x) * (1.0 - blend);
  float weight_src = dof_luma_weight(src.color.x) * (blend);

  /* Weighted blend. CoC is swizzled into the alpha channel. */
  vec4 dst_packed = vec4(dst.color.rgb, dst.coc);
  vec4 src_packed = vec4(src.color.rgb, src.coc);
  vec4 stabilized = (dst_packed * weight_dst + src_packed * weight_src) /
                    (weight_src + weight_dst);

  /* Save history for next iteration. Still in YCoCg space with CoC in alpha. */
  imageStore(out_history_img, src_texel, stabilized);

  /* Un-swizzle. */
  float out_coc = stabilized.a;
  /* Clamp opacity since we don't store it in history. */
  float out_opacity = clamp(src.color.a, bbox.min.color.a, bbox.max.color.a);
  vec4 out_color = colorspace_scene_linear_from_YCoCg(vec4(stabilized.rgb, out_opacity));

  imageStore(out_color_img, src_texel, out_color);
  imageStore(out_coc_img, src_texel, vec4(out_coc));
}