8 files changed, 596 insertions, 2 deletions
diff --git a/source/blender/gpu/CMakeLists.txt b/source/blender/gpu/CMakeLists.txt
index 47d4feb7ec9..cb5bb4331f9 100644
--- a/source/blender/gpu/CMakeLists.txt
+++ b/source/blender/gpu/CMakeLists.txt
@@ -345,8 +345,11 @@ set(GLSL_SRC
   shaders/compositor/compositor_screen_lens_distortion.glsl
   shaders/compositor/compositor_set_alpha.glsl
   shaders/compositor/compositor_split_viewer.glsl
+  shaders/compositor/compositor_symmetric_blur.glsl
+  shaders/compositor/compositor_symmetric_separable_blur.glsl
 
   shaders/compositor/library/gpu_shader_compositor_alpha_over.glsl
+  shaders/compositor/library/gpu_shader_compositor_blur_common.glsl
   shaders/compositor/library/gpu_shader_compositor_bright_contrast.glsl
   shaders/compositor/library/gpu_shader_compositor_channel_matte.glsl
   shaders/compositor/library/gpu_shader_compositor_chroma_matte.glsl
@@ -620,6 +623,8 @@ set(SRC_SHADER_CREATE_INFOS
   shaders/compositor/infos/compositor_screen_lens_distortion_info.hh
   shaders/compositor/infos/compositor_set_alpha_info.hh
   shaders/compositor/infos/compositor_split_viewer_info.hh
+  shaders/compositor/infos/compositor_symmetric_blur_info.hh
+  shaders/compositor/infos/compositor_symmetric_separable_blur_info.hh
 )
 
 set(SRC_SHADER_CREATE_INFOS_MTL
diff --git a/source/blender/gpu/shaders/compositor/compositor_symmetric_blur.glsl b/source/blender/gpu/shaders/compositor/compositor_symmetric_blur.glsl
new file mode 100644
index 00000000000..df08991a35c
--- /dev/null
+++ b/source/blender/gpu/shaders/compositor/compositor_symmetric_blur.glsl
@@ -0,0 +1,77 @@
+#pragma BLENDER_REQUIRE(gpu_shader_compositor_blur_common.glsl)
+#pragma BLENDER_REQUIRE(gpu_shader_compositor_texture_utilities.glsl)
+
+/* Loads the input color that the blur filter sees at the given texel. The load is offset when
+ * the bounds of the input are extended and the color is preprocessed when gamma correction is
+ * enabled. */
+vec4 load_input(ivec2 texel)
+{
+  vec4 input_color;
+  if (!extend_bounds) {
+    /* The bounds are not extended, so the input is loaded at the texel as is. */
+    input_color = texture_load(input_tx, texel);
+  }
+  else {
+    /* The bounds are extended, so the input is treated as if it was padded by a radius amount
+     * of pixels. Consequently, the input is loaded at an offset of the radius and out of bounds
+     * loads fall back to a transparent color. One is subtracted from the size of the weights
+     * texture because it has an extra center weight, see SymmetricBlurWeights for details. */
+    ivec2 radius = texture_size(weights_tx) - 1;
+    input_color = texture_load(input_tx, texel - radius, vec4(0.0));
+  }
+
+  return gamma_correct ? gamma_correct_blur_input(input_color) : input_color;
+}
+
+void main()
+{
+  ivec2 texel = ivec2(gl_GlobalInvocationID.xy);
+  ivec2 quadrant_size = texture_size(weights_tx);
+
+  /* The filter is symmetric, so the weights texture only stores the weights of its upper right
+   * quadrant, including the positive halves of both axes. Each stored weight is consequently
+   * shared by all the pixels that mirror each other around the center pixel, and the
+   * contributions of all of those pixels are accumulated below. */
+  vec4 blurred_color = vec4(0.0);
+
+  /* First, the center pixel, which has no mirrors. Its weight is stored at the origin. */
+  blurred_color += load_input(texel) * texture_load(weights_tx, ivec2(0)).x;
+
+  /* Then, the pixels along the x axis, where each pixel in the positive half has a single
+   * mirror in the negative half. */
+  for (int i = 1; i < quadrant_size.x; i++) {
+    ivec2 offset = ivec2(i, 0);
+    float axis_weight = texture_load(weights_tx, offset).x;
+    blurred_color += load_input(texel + offset) * axis_weight;
+    blurred_color += load_input(texel - offset) * axis_weight;
+  }
+
+  /* Then, the pixels along the y axis, where each pixel in the positive half has a single
+   * mirror in the negative half. */
+  for (int j = 1; j < quadrant_size.y; j++) {
+    ivec2 offset = ivec2(0, j);
+    float axis_weight = texture_load(weights_tx, offset).x;
+    blurred_color += load_input(texel + offset) * axis_weight;
+    blurred_color += load_input(texel - offset) * axis_weight;
+  }
+
+  /* Finally, the pixels inside the quadrants, where each pixel in the upper right quadrant has
+   * a mirror in each of the other three quadrants. */
+  for (int j = 1; j < quadrant_size.y; j++) {
+    for (int i = 1; i < quadrant_size.x; i++) {
+      ivec2 offset = ivec2(i, j);
+      ivec2 flipped_offset = ivec2(-i, j);
+      float quadrant_weight = texture_load(weights_tx, offset).x;
+      blurred_color += load_input(texel + offset) * quadrant_weight;
+      blurred_color += load_input(texel + flipped_offset) * quadrant_weight;
+      blurred_color += load_input(texel - flipped_offset) * quadrant_weight;
+      blurred_color += load_input(texel - offset) * quadrant_weight;
+    }
+  }
+
+  if (gamma_correct) {
+    blurred_color = gamma_uncorrect_blur_output(blurred_color);
+  }
+
+  imageStore(output_img, texel, blurred_color);
+}
diff --git a/source/blender/gpu/shaders/compositor/compositor_symmetric_separable_blur.glsl b/source/blender/gpu/shaders/compositor/compositor_symmetric_separable_blur.glsl
new file mode 100644
index 00000000000..ab0c7baa787
--- /dev/null
+++ b/source/blender/gpu/shaders/compositor/compositor_symmetric_separable_blur.glsl
@@ -0,0 +1,53 @@
+#pragma BLENDER_REQUIRE(gpu_shader_compositor_blur_common.glsl)
+#pragma BLENDER_REQUIRE(gpu_shader_compositor_texture_utilities.glsl)
+
+/* Loads the input color that the blur filter sees at the given texel. The load is offset along
+ * the x axis when the bounds of the input are extended and the color is preprocessed when gamma
+ * correction of the input is enabled. */
+vec4 load_input(ivec2 texel)
+{
+  vec4 input_color;
+  if (!extend_bounds) {
+    /* The bounds are not extended, so the input is loaded at the texel as is. */
+    input_color = texture_load(input_tx, texel);
+  }
+  else {
+    /* The bounds are extended, so the input is treated as if it was padded by a radius amount
+     * of pixels. Consequently, the input is loaded at an offset of the radius and out of bounds
+     * loads fall back to a transparent color. One is subtracted from the size of the weights
+     * texture because it has an extra center weight, see SymmetricSeparableBlurWeights. */
+    int radius = texture_size(weights_tx) - 1;
+    input_color = texture_load(input_tx, texel - ivec2(radius, 0), vec4(0.0));
+  }
+
+  return gamma_correct_input ? gamma_correct_blur_input(input_color) : input_color;
+}
+
+void main()
+{
+  ivec2 texel = ivec2(gl_GlobalInvocationID.xy);
+  int weights_count = texture_size(weights_tx);
+
+  /* First, the center pixel, whose weight is stored first in the weights texture. */
+  vec4 blurred_color = vec4(0.0);
+  blurred_color += load_input(texel) * texture_load(weights_tx, 0).x;
+
+  /* Then, the pixels to the right and left of the center. The weights texture only stores the
+   * weights of the positive half of the filter, but the filter is symmetric, so each weight is
+   * also used for the mirrored pixel in the negative half. */
+  for (int tap = 1; tap < weights_count; tap++) {
+    ivec2 offset = ivec2(tap, 0);
+    float tap_weight = texture_load(weights_tx, tap).x;
+    blurred_color += load_input(texel + offset) * tap_weight;
+    blurred_color += load_input(texel - offset) * tap_weight;
+  }
+
+  if (gamma_uncorrect_output) {
+    blurred_color = gamma_uncorrect_blur_output(blurred_color);
+  }
+
+  /* The color is written to the transposed texel, so the output image is the transpose of the
+   * blurred input. See the execute_separable_blur_horizontal_pass method for more information
+   * on the rationale behind this. */
+  imageStore(output_img, texel.yx, blurred_color);
+}
diff --git a/source/blender/gpu/shaders/compositor/infos/compositor_symmetric_blur_info.hh b/source/blender/gpu/shaders/compositor/infos/compositor_symmetric_blur_info.hh
new file mode 100644
index 00000000000..8ba2b4e04ef
--- /dev/null
+++ b/source/blender/gpu/shaders/compositor/infos/compositor_symmetric_blur_info.hh
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#include "gpu_shader_create_info.hh"
+
+GPU_SHADER_CREATE_INFO(compositor_symmetric_blur)
+    .local_group_size(16, 16)
+    .push_constant(Type::BOOL, "extend_bounds") /* Treat the input as padded by the radius. */
+    .push_constant(Type::BOOL, "gamma_correct") /* Blur the squared straight alpha color. */
+    .sampler(0, ImageType::FLOAT_2D, "input_tx")
+    .sampler(1, ImageType::FLOAT_2D, "weights_tx") /* Upper right quadrant of the filter. */
+    .image(0, GPU_RGBA16F, Qualifier::WRITE, ImageType::FLOAT_2D, "output_img")
+    .compute_source("compositor_symmetric_blur.glsl")
+    .do_static_compilation(true);
diff --git a/source/blender/gpu/shaders/compositor/infos/compositor_symmetric_separable_blur_info.hh b/source/blender/gpu/shaders/compositor/infos/compositor_symmetric_separable_blur_info.hh
new file mode 100644
index 00000000000..57247dba4b8
--- /dev/null
+++ b/source/blender/gpu/shaders/compositor/infos/compositor_symmetric_separable_blur_info.hh
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#include "gpu_shader_create_info.hh"
+
+GPU_SHADER_CREATE_INFO(compositor_symmetric_separable_blur)
+    .local_group_size(16, 16)
+    .push_constant(Type::BOOL, "extend_bounds") /* Treat the input as padded by the radius. */
+    .push_constant(Type::BOOL, "gamma_correct_input") /* Preprocess the loaded colors. */
+    .push_constant(Type::BOOL, "gamma_uncorrect_output") /* Postprocess the blurred color. */
+    .sampler(0, ImageType::FLOAT_2D, "input_tx")
+    .sampler(1, ImageType::FLOAT_1D, "weights_tx") /* Center and positive half of the filter. */
+    .image(0, GPU_RGBA16F, Qualifier::WRITE, ImageType::FLOAT_2D, "output_img")
+    .compute_source("compositor_symmetric_separable_blur.glsl")
+    .do_static_compilation(true);
diff --git a/source/blender/gpu/shaders/compositor/library/gpu_shader_compositor_blur_common.glsl b/source/blender/gpu/shaders/compositor/library/gpu_shader_compositor_blur_common.glsl
new file mode 100644
index 00000000000..e404c03bbb0
--- /dev/null
+++ b/source/blender/gpu/shaders/compositor/library/gpu_shader_compositor_blur_common.glsl
@@ -0,0 +1,32 @@
+/* Preprocesses the input of the blur filter by squaring its channels in straight alpha form.
+ * The given color is assumed to be alpha premultiplied, and so is the returned color. */
+vec4 gamma_correct_blur_input(vec4 color)
+{
+  /* Colors whose alpha is not positive are divided and multiplied by one instead. */
+  float alpha = color.a > 0.0 ? color.a : 1.0;
+
+  /* Unpremultiply alpha to get the straight form of the color. */
+  vec3 straight_color = color.rgb / alpha;
+
+  /* Multiplying by the non-negative part squares positive channels and zeros negative ones. */
+  vec3 positive_color = mix(straight_color, vec3(0.0), lessThan(straight_color, vec3(0.0)));
+  /* Premultiply alpha again to undo the unpremultiplication above. */
+  return vec4(straight_color * positive_color * alpha, color.a);
+}
+
+/* Postprocesses the output of the blur filter by taking the square root of its channels in
+ * straight alpha form, assuming the given color is alpha premultiplied. This essentially undoes
+ * the processing done by the gamma_correct_blur_input function. */
+vec4 gamma_uncorrect_blur_output(vec4 color)
+{
+  /* Colors whose alpha is not positive are divided and multiplied by one instead. */
+  float alpha = color.a > 0.0 ? color.a : 1.0;
+
+  /* Unpremultiply alpha to get the straight form of the color. */
+  vec3 straight_color = color.rgb / alpha;
+
+  /* Take the square root of positive channels and zero negative ones. */
+  vec3 root_color = mix(sqrt(straight_color), vec3(0.0), lessThan(straight_color, vec3(0.0)));
+  /* Premultiply alpha again to undo the unpremultiplication above. */
+  return vec4(root_color * alpha, color.a);
+}
diff --git a/source/blender/nodes/composite/nodes/node_composite_blur.cc b/source/blender/nodes/composite/nodes/node_composite_blur.cc
index cb1d93fe10b..630f18361e3 100644
--- a/source/blender/nodes/composite/nodes/node_composite_blur.cc
+++ b/source/blender/nodes/composite/nodes/node_composite_blur.cc
@@ -5,12 +5,27 @@
  * \ingroup cmpnodes
  */
 
+#include <cstdint>
+
+#include "BLI_array.hh"
+#include "BLI_assert.h"
+#include "BLI_index_range.hh"
+#include "BLI_math_base.hh"
+#include "BLI_math_vec_types.hh"
+#include "BLI_math_vector.hh"
+
 #include "RNA_access.h"
 
 #include "UI_interface.h"
 #include "UI_resources.h"
 
+#include "RE_pipeline.h"
+
+#include "GPU_state.h"
+#include "GPU_texture.h"
+
 #include "COM_node_operation.hh"
+#include "COM_utilities.hh"
 
 #include "node_composite_util.hh"
 
@@ -18,6 +33,8 @@
 
 namespace blender::nodes::node_composite_blur_cc {
 
+NODE_STORAGE_FUNCS(NodeBlurData)
+
 static void cmp_node_blur_declare(NodeDeclarationBuilder &b)
 {
   b.add_input<decl::Color>(N_("Image")).default_value({1.0f, 1.0f, 1.0f, 1.0f});
@@ -75,13 +92,395 @@ static void node_composit_buts_blur(uiLayout *layout, bContext *UNUSED(C), Point
 
 using namespace blender::realtime_compositor;
 
+/* A helper class that computes and caches a 1D GPU texture storing the weights of a separable
+ * filter of a given type and radius. The filter functions are all even functions, so the filter
+ * is symmetric. Consequently, only the center weight and the weights of the positive half of the
+ * filter are stored, and the shader that reads them takes that into consideration. */
+class SymmetricSeparableBlurWeights {
+ private:
+  float radius_ = 1.0f;
+  int type_ = R_FILTER_GAUSS;
+  GPUTexture *texture_ = nullptr;
+
+ public:
+  ~SymmetricSeparableBlurWeights()
+  {
+    if (texture_ != nullptr) {
+      GPU_texture_free(texture_);
+    }
+  }
+
+  /* Make sure the cached texture stores the weights of the given filter type and radius. Nothing
+   * is done if the texture was already computed for them, otherwise, the cached texture is freed
+   * and a new one is computed. */
+  void update(float radius, int type)
+  {
+    const bool is_cached = texture_ != nullptr && type == type_ && radius == radius_;
+    if (is_cached) {
+      return;
+    }
+
+    if (texture_ != nullptr) {
+      GPU_texture_free(texture_);
+    }
+
+    /* The full filter covers double the radius plus 1 pixels, but since it is symmetric, only
+     * half of it is computed and no doubling happens. The 1 is added to make sure the full
+     * filter size is always odd and there is a center weight. */
+    const int size = math::ceil(radius) + 1;
+    Array<float> weights(size);
+
+    /* First, compute the center weight, which is only counted once in the sum of weights. */
+    weights[0] = RE_filter_value(type, 0.0f);
+    float sum = weights[0];
+
+    /* Then, compute the rest of the weights in the positive direction, at distances scaled by
+     * the reciprocal of the radius. Each of them is counted twice in the sum of weights because
+     * the filter is symmetric and only half of it is looped over. */
+    const float scale = radius > 0.0f ? 1.0f / radius : 0.0f;
+    for (int i = 1; i < size; i++) {
+      weights[i] = RE_filter_value(type, i * scale);
+      sum += weights[i] * 2.0f;
+    }
+
+    /* Finally, normalize the weights. */
+    for (float &weight : weights) {
+      weight /= sum;
+    }
+
+    /* Create the texture from the normalized weights and remember what it was computed for. */
+    texture_ = GPU_texture_create_1d("Weights", size, 1, GPU_R16F, weights.data());
+
+    type_ = type;
+    radius_ = radius;
+  }
+
+  /* Binds the cached weights texture to the unit that the given shader assigns to the texture
+   * of the given name. */
+  void bind_as_texture(GPUShader *shader, const char *texture_name)
+  {
+    GPU_texture_bind(texture_, GPU_shader_get_texture_binding(shader, texture_name));
+  }
+
+  /* Unbinds the cached weights texture. */
+  void unbind_as_texture()
+  {
+    GPU_texture_unbind(texture_);
+  }
+};
+
+/* A helper class that computes and caches a 2D GPU texture storing the weights of a filter of a
+ * given type and radius. The filter functions are evaluated on the normalized distance to the
+ * center, so the filter is symmetric. Consequently, only the upper right quadrant of the filter
+ * is stored, and the shader that reads it takes that into consideration. */
+class SymmetricBlurWeights {
+ private:
+  int type_ = R_FILTER_GAUSS;
+  float2 radius_ = float2(1.0f);
+  GPUTexture *texture_ = nullptr;
+
+ public:
+  ~SymmetricBlurWeights()
+  {
+    if (texture_ != nullptr) {
+      GPU_texture_free(texture_);
+    }
+  }
+
+  /* Make sure the cached texture stores the weights of the given filter type and radius. Nothing
+   * is done if the texture was already computed for them, otherwise, the cached texture is freed
+   * and a new one is computed. */
+  void update(float2 radius, int type)
+  {
+    const bool is_cached = texture_ != nullptr && type == type_ && radius == radius_;
+    if (is_cached) {
+      return;
+    }
+
+    if (texture_ != nullptr) {
+      GPU_texture_free(texture_);
+    }
+
+    /* The full filter covers double the radius plus 1 pixels along each axis, but since it is
+     * symmetric, only a single quadrant of it is computed and no doubling happens. The 1 is
+     * added to make sure the full filter size is always odd and there is a center weight. The
+     * weights are stored row by row, so the weight at (x, y) is at index size.x * y + x. */
+    const int2 size = int2(math::ceil(radius)) + int2(1);
+    const float2 scale = math::safe_divide(float2(1.0f), radius);
+    Array<float> weights(size.x * size.y);
+
+    /* First, compute the center weight, which is only counted once in the sum of weights. */
+    weights[0] = RE_filter_value(type, 0.0f);
+    float sum = weights[0];
+
+    /* Then, compute the weights along the positive x axis, skipping the center weight. Each of
+     * them is counted twice in the sum of weights because the filter is symmetric and only the
+     * positive half of the x axis is looped over. */
+    for (int x = 1; x < size.x; x++) {
+      weights[x] = RE_filter_value(type, x * scale.x);
+      sum += weights[x] * 2.0f;
+    }
+
+    /* Then, compute the weights along the positive y axis, skipping the center weight. Each of
+     * them is counted twice in the sum of weights because the filter is symmetric and only the
+     * positive half of the y axis is looped over. */
+    for (int y = 1; y < size.y; y++) {
+      float &weight = weights[size.x * y];
+      weight = RE_filter_value(type, y * scale.y);
+      sum += weight * 2.0f;
+    }
+
+    /* Then, compute the rest of the weights in the upper right quadrant, skipping the weights
+     * along the x and y axes that were already computed. Each of them is counted four times in
+     * the sum of weights because the filter is symmetric and only one quadrant of it is looped
+     * over. */
+    for (int y = 1; y < size.y; y++) {
+      for (int x = 1; x < size.x; x++) {
+        float &weight = weights[size.x * y + x];
+        weight = RE_filter_value(type, math::length(float2(x, y) * scale));
+        sum += weight * 4.0f;
+      }
+    }
+
+    /* Finally, normalize the weights, which are all stored in the same array, so a single loop
+     * over it covers both the axes and the quadrant. */
+    for (float &weight : weights) {
+      weight /= sum;
+    }
+
+    /* Create the texture from the normalized weights and remember what it was computed for. */
+    texture_ = GPU_texture_create_2d("Weights", size.x, size.y, 1, GPU_R16F, weights.data());
+
+    type_ = type;
+    radius_ = radius;
+  }
+
+  /* Binds the cached weights texture to the unit that the given shader assigns to the texture
+   * of the given name. */
+  void bind_as_texture(GPUShader *shader, const char *texture_name)
+  {
+    GPU_texture_bind(texture_, GPU_shader_get_texture_binding(shader, texture_name));
+  }
+
+  /* Unbinds the cached weights texture. */
+  void unbind_as_texture()
+  {
+    GPU_texture_unbind(texture_);
+  }
+};
+
 class BlurOperation : public NodeOperation {
+ private:
+  /* Cached symmetric blur weights. */
+  SymmetricBlurWeights blur_weights_;
+  /* Cached symmetric blur weights for the separable horizontal pass. */
+  SymmetricSeparableBlurWeights blur_horizontal_weights_;
+  /* Cached symmetric blur weights for the separable vertical pass. */
+  SymmetricSeparableBlurWeights blur_vertical_weights_;
+
  public:
   using NodeOperation::NodeOperation;
 
   void execute() override
   {
-    get_input("Image").pass_through(get_result("Image"));
+    if (is_identity()) {
+      get_input("Image").pass_through(get_result("Image"));
+    }
+    else if (use_separable_filter()) {
+      GPUTexture *horizontal_pass_result = execute_separable_blur_horizontal_pass();
+      execute_separable_blur_vertical_pass(horizontal_pass_result);
+      /* Hand the intermediate texture back to the pool now that the vertical pass used it. */
+      texture_pool().release(horizontal_pass_result);
+    }
+    else {
+      execute_blur();
+    }
+  }
+
+  void execute_blur()
+  {
+    const bool extend_bounds = get_extend_bounds();
+    const float2 radius = compute_blur_radius();
+
+    GPUShader *shader = shader_manager().get("compositor_symmetric_blur");
+    GPU_shader_bind(shader);
+    GPU_shader_uniform_1b(shader, "extend_bounds", extend_bounds);
+    GPU_shader_uniform_1b(shader, "gamma_correct", node_storage(bnode()).gamma);
+
+    const Result &input_image = get_input("Image");
+    input_image.bind_as_texture(shader, "input_tx");
+
+    blur_weights_.update(radius, node_storage(bnode()).filtertype);
+    blur_weights_.bind_as_texture(shader, "weights_tx");
+
+    /* Extended bounds add a radius amount of pixels on both sides of the image, hence the 2. */
+    Domain domain = compute_domain();
+    domain.size += extend_bounds ? int2(math::ceil(radius)) * 2 : int2(0);
+
+    Result &output_image = get_result("Image");
+    output_image.allocate_texture(domain);
+    output_image.bind_as_image(shader, "output_img");
+
+    compute_dispatch_threads_at_least(shader, domain.size);
+
+    GPU_shader_unbind();
+    output_image.unbind_as_image();
+    input_image.unbind_as_texture();
+    blur_weights_.unbind_as_texture();
+  }
+
+  GPUTexture *execute_separable_blur_horizontal_pass()
+  {
+    const bool extend_bounds = get_extend_bounds();
+    const float horizontal_radius = compute_blur_radius().x;
+
+    GPUShader *shader = shader_manager().get("compositor_symmetric_separable_blur");
+    GPU_shader_bind(shader);
+    GPU_shader_uniform_1b(shader, "extend_bounds", extend_bounds);
+    GPU_shader_uniform_1b(shader, "gamma_correct_input", node_storage(bnode()).gamma);
+    GPU_shader_uniform_1b(shader, "gamma_uncorrect_output", false);
+
+    const Result &input_image = get_input("Image");
+    input_image.bind_as_texture(shader, "input_tx");
+
+    blur_horizontal_weights_.update(horizontal_radius, node_storage(bnode()).filtertype);
+    blur_horizontal_weights_.bind_as_texture(shader, "weights_tx");
+
+    /* Only the width is extended in this pass, by a radius amount of pixels on both sides. */
+    Domain domain = compute_domain();
+    domain.size.x += extend_bounds ? static_cast<int>(math::ceil(horizontal_radius)) * 2 : 0;
+
+    /* The intermediate output is allocated with a transposed size, that is, with a height equal
+     * to the width of the domain and vice versa. The shader blurs the image horizontally and
+     * writes the result transposed. The vertical pass then executes the same horizontal blur
+     * shader on that transposed result, which effectively blurs vertically, and writes its
+     * output transposed as well, which undoes the transposition of this pass. This is done as a
+     * performance optimization, to improve spatial cache locality in the shader, and to avoid
+     * having a separate shader for each of the blur passes. The returned texture is acquired
+     * from the texture pool. */
+    const int2 transposed_size = int2(domain.size.y, domain.size.x);
+    GPUTexture *transposed_result = texture_pool().acquire_color(transposed_size);
+    const int image_unit = GPU_shader_get_texture_binding(shader, "output_img");
+    GPU_texture_image_bind(transposed_result, image_unit);
+
+    compute_dispatch_threads_at_least(shader, domain.size);
+
+    GPU_shader_unbind();
+    input_image.unbind_as_texture();
+    blur_horizontal_weights_.unbind_as_texture();
+    GPU_texture_image_unbind(transposed_result);
+
+    return transposed_result;
+  }
+
+  void execute_separable_blur_vertical_pass(GPUTexture *horizontal_pass_result)
+  {
+    const bool extend_bounds = get_extend_bounds();
+    const float2 radius = compute_blur_radius();
+
+    GPUShader *shader = shader_manager().get("compositor_symmetric_separable_blur");
+    GPU_shader_bind(shader);
+    GPU_shader_uniform_1b(shader, "extend_bounds", extend_bounds);
+    GPU_shader_uniform_1b(shader, "gamma_correct_input", false);
+    GPU_shader_uniform_1b(shader, "gamma_uncorrect_output", node_storage(bnode()).gamma);
+
+    /* The horizontal pass wrote its result as an image and it is read as a texture here. */
+    GPU_memory_barrier(GPU_BARRIER_TEXTURE_FETCH);
+    GPU_texture_bind(horizontal_pass_result, GPU_shader_get_texture_binding(shader, "input_tx"));
+
+    blur_vertical_weights_.update(radius.y, node_storage(bnode()).filtertype);
+    blur_vertical_weights_.bind_as_texture(shader, "weights_tx");
+
+    /* Extended bounds add a radius amount of pixels on both sides of the image, hence the 2. */
+    Domain domain = compute_domain();
+    domain.size += extend_bounds ? int2(math::ceil(radius)) * 2 : int2(0);
+
+    Result &output_image = get_result("Image");
+    output_image.allocate_texture(domain);
+    output_image.bind_as_image(shader, "output_img");
+
+    /* The dispatch covers the transposed domain because the input of this pass is transposed,
+     * see the note in the horizontal pass method for the reasoning behind this. */
+    compute_dispatch_threads_at_least(shader, int2(domain.size.y, domain.size.x));
+
+    GPU_shader_unbind();
+    output_image.unbind_as_image();
+    blur_vertical_weights_.unbind_as_texture();
+    GPU_texture_unbind(horizontal_pass_result);
+  }
+
+  float2 compute_blur_radius()
+  {
+    const auto &storage = node_storage(bnode());
+    const float size = math::clamp(get_input("Size").get_float_value_default(1.0f), 0.0f, 1.0f);
+
+    if (!storage.relative) {
+      return float2(storage.sizex, storage.sizey) * size;
+    }
+
+    /* The aspect option makes the relative size of both axes use the same image dimension. */
+    int2 image_size = get_input("Image").domain().size;
+    if (storage.aspect == CMP_NODE_BLUR_ASPECT_Y) {
+      image_size.y = image_size.x;
+    }
+    else if (storage.aspect == CMP_NODE_BLUR_ASPECT_X) {
+      image_size.x = image_size.y;
+    }
+    else {
+      BLI_assert(storage.aspect == CMP_NODE_BLUR_ASPECT_NONE);
+    }
+
+    return float2(image_size) * get_size_factor() * size;
+  }
+
+  /* Returns true if the operation does nothing, in which case the input can be passed through
+   * to the output as is. */
+  bool is_identity()
+  {
+    /* Single value inputs can't be blurred and are returned as is. This is checked first, so
+     * the blur radius is not computed for them. */
+    const bool is_single_value = get_input("Image").is_single_value();
+    if (is_single_value) {
+      return true;
+    }
+
+    /* A blur radius that is zero along both axes means the operation does nothing and the
+     * input can be passed through. */
+    const float2 radius = compute_blur_radius();
+    return radius == float2(0.0);
+  }
+
+  /* Returns true if the blur should be computed as two separable passes. The blur node can
+   * operate with different filter types, evaluated on the normalized distance to the center of
+   * the filter, and only some of those filters are separable. If the bokeh member is disabled
+   * in the node, the filter is always computed as separable even if it is not in fact
+   * separable, in which case, the used filter is a cheaper approximation of the actual filter.
+   * If the bokeh member is enabled, the filter is computed as separable only if it is in fact
+   * separable and as a normal 2D filter otherwise. */
+  bool use_separable_filter()
+  {
+    const auto &storage = node_storage(bnode());
+
+    if (!storage.bokeh) {
+      return true;
+    }
+
+    /* Both Box and Gaussian filters are separable, where the fast Gaussian filter type is
+     * treated the same as the Gaussian one. The rest of the filters are not separable. */
+    const int type = storage.filtertype;
+    const bool is_box = type == R_FILTER_BOX;
+    const bool is_gaussian = type == R_FILTER_GAUSS || type == R_FILTER_FAST_GAUSS;
+    return is_box || is_gaussian;
+  }
+
+  float2 get_size_factor() /* The node percentages as factors of the image size. */
+  {
+    return float2(node_storage(bnode()).percentx, node_storage(bnode()).percenty) / 100.0f;
+  }
+
+  bool get_extend_bounds()
+  {
+    return (bnode().custom1 & CMP_NODEFLAG_BLUR_EXTEND_BOUNDS) != 0;
   }
 };
 
diff --git a/source/blender/render/intern/initrender.cc b/source/blender/render/intern/initrender.cc
index cc05aa8621e..1ea93cbf6c8 100644
--- a/source/blender/render/intern/initrender.cc
+++ b/source/blender/render/intern/initrender.cc
@@ -124,7 +124,8 @@ float RE_filter_value(int type, float x)
       }
       return 1.0f - x;
 
-    case R_FILTER_GAUSS: {
+    case R_FILTER_GAUSS:
+    case R_FILTER_FAST_GAUSS: {
       const float two_gaussfac2 = 2.0f * gaussfac * gaussfac;
       x *= 3.0f * gaussfac;
       return 1.0f / sqrtf((float)M_PI * two_gaussfac2) * expf(-x * x / two_gaussfac2);