/**
 * Shader that down-samples the velocity buffer into square tiles that are MB_TILE_DIVISOR
 * pixels wide. Outputs the largest motion vector in each tile area.
 * Also performs the velocity resolve to speed up the convolution pass.
 *
 * Based on:
 * A Fast and Stable Feature-Aware Motion Blur Filter
 * by Jean-Philippe Guertin, Morgan McGuire, Derek Nowrouzezahrai
 *
 * Adapted from G3D Innovation Engine implementation.
 */

#pragma BLENDER_REQUIRE(common_math_geom_lib.glsl)
#pragma BLENDER_REQUIRE(eevee_velocity_lib.glsl)

/* Workgroup-wide maximum of the packed payloads (see pack_payload()), reduced with atomicMax().
 * One slot for the `motion.xy` (previous) vector and one for the `motion.zw` (next) vector. */
shared uint payload_prev;
shared uint payload_next;
/* Motion vectors broadcast by the invocations whose thread id is stored in the winning payloads.
 * Read back by invocation (0, 0) to issue the single write to the tile image. */
shared vec2 max_motion_prev;
shared vec2 max_motion_next;

/**
 * Pack a motion vector magnitude and a workgroup-local thread id into one sortable `uint`.
 *
 * Layout: bits [31..16] = ceil(length(motion)) clamped to 0xFFFF,
 *         bits [15..8]  = thread_id.y,
 *         bits [7..0]   = thread_id.x.
 * Because the magnitude sits in the most significant bits, an integer max over payloads selects
 * the largest magnitude first and only uses the thread id to break ties.
 *
 * \param motion: Motion vector whose length is encoded (in whole units, rounded up).
 * \param thread_id: Local invocation id. Each component must fit in 8 bits to round-trip through
 * `unpack_payload()`; larger values would overlap the neighboring bit-fields.
 * \return The packed payload.
 */
uint pack_payload(vec2 motion, uvec2 thread_id)
{
  /* NOTE: The magnitude is clamped to 0xFFFF (65535) so that it fits in the upper 16 bits.
   * The clamp is done in floating point *before* the conversion: converting a float that does not
   * fit the destination integer type is undefined, so clamping the converted value is too late for
   * very large magnitudes. */
  uint magnitude = uint(min(ceil(length(motion)), 65535.0));
  return (magnitude << 16u) | (thread_id.y << 8u) | thread_id.x;
}

/* Extract the workgroup-local thread coordinate stored in the low 16 bits of a payload:
 * X lives in bits [7..0] and Y in bits [15..8]. The magnitude bits above are discarded. */
uvec2 unpack_payload(uint payload)
{
  uint thread_x = payload & 0xFFu;
  uint thread_y = (payload >> 8) & 0xFFu;
  return uvec2(thread_x, thread_y);
}

/**
 * Each invocation resolves the velocity of one texel, stores it back into `velocity_img`, then
 * takes part in a workgroup-wide reduction that selects the previous and next motion vectors with
 * the largest magnitude. One texel of `out_tiles_img` is written per workgroup, at
 * `gl_WorkGroupID.xy`.
 *
 * The reduction works on packed payloads (see `pack_payload()`): `atomicMax()` picks the winning
 * payload, the thread id embedded in it identifies the invocation holding the full-precision
 * vector, and that invocation broadcasts the vector through shared memory.
 */
void main()
{
  /* A single invocation resets the shared reduction slots before anyone accumulates into them. */
  if (all(equal(gl_LocalInvocationID.xy, uvec2(0)))) {
    payload_prev = 0u;
    payload_next = 0u;
  }
  barrier();

  uint local_payload_prev = 0u;
  uint local_payload_next = 0u;
  /* These must be initialized. Invocation (0, 0) with zero motion produces a payload of 0, which
   * does not pass the `<` tests below, yet that same invocation wins the reduction when the whole
   * tile is static (the shared payload stays 0 and unpacks to thread (0, 0)). Without an
   * initializer an undefined vector would then be written to the tile image. */
  vec2 local_max_motion_prev = vec2(0.0);
  vec2 local_max_motion_next = vec2(0.0);

  /* Clamp the read coordinate so invocations past the image border still load a valid texel. */
  ivec2 texel = min(ivec2(gl_GlobalInvocationID.xy), imageSize(velocity_img) - 1);

  vec2 render_size = vec2(imageSize(velocity_img).xy);
  vec2 uv = (vec2(texel) + 0.5) / render_size;
  float depth = texelFetch(depth_tx, texel, 0).r;
  vec4 motion = velocity_resolve(imageLoad(velocity_img, texel), uv, depth);
#ifdef FLATTEN_VIEWPORT
  /* imageLoad does not perform the swizzling like sampler does. Do it manually. */
  motion = motion.xyxy;
#endif

  /* Store resolved velocity to speedup the gather pass. Out of bounds writes are ignored.
   * Unfortunately, we cannot convert to pixel space here since it is also used by TAA and the
   * motion blur needs to remain optional. */
  imageStore(velocity_img, ivec2(gl_GlobalInvocationID.xy), velocity_pack(motion));
  /* Clip velocity to viewport bounds (in NDC space). */
  vec2 line_clip;
  line_clip.x = line_unit_square_intersect_dist_safe(uv * 2.0 - 1.0, motion.xy * 2.0);
  line_clip.y = line_unit_square_intersect_dist_safe(uv * 2.0 - 1.0, -motion.zw * 2.0);
  /* Only ever shorten the vectors: the clip factor is capped at 1. */
  motion *= min(line_clip, vec2(1.0)).xxyy;
  /* Convert to pixel space. Note this is only for velocity tiles. */
  motion *= render_size.xyxy;
  /* Rescale to shutter relative motion for viewport. */
  motion *= motion_blur_buf.motion_scale.xxyy;

  /* Previous motion (`xy`): keep the vector alongside its packed payload. */
  uint sample_payload_prev = pack_payload(motion.xy, gl_LocalInvocationID.xy);
  if (local_payload_prev < sample_payload_prev) {
    local_payload_prev = sample_payload_prev;
    local_max_motion_prev = motion.xy;
  }

  /* Next motion (`zw`): same as above. */
  uint sample_payload_next = pack_payload(motion.zw, gl_LocalInvocationID.xy);
  if (local_payload_next < sample_payload_next) {
    local_payload_next = sample_payload_next;
    local_max_motion_next = motion.zw;
  }

  /* Compare the local payload with the other threads. */
  atomicMax(payload_prev, local_payload_prev);
  atomicMax(payload_next, local_payload_next);
  barrier();

  /* Need to broadcast the result to another thread in order to issue a unique write.
   * Only the invocation whose id is embedded in the winning payload publishes its vector. */
  if (all(equal(unpack_payload(payload_prev), gl_LocalInvocationID.xy))) {
    max_motion_prev = local_max_motion_prev;
  }
  if (all(equal(unpack_payload(payload_next), gl_LocalInvocationID.xy))) {
    max_motion_next = local_max_motion_next;
  }
  barrier();

  /* Invocation (0, 0) writes the tile result: one texel per workgroup. */
  if (all(equal(gl_LocalInvocationID.xy, uvec2(0)))) {
    ivec2 tile_co = ivec2(gl_WorkGroupID.xy);
    imageStore(out_tiles_img, tile_co, vec4(max_motion_prev, max_motion_next));
  }
}