libavfilter/x86/vf_gblur: add ff_verti_slice_avx2/512()

The new vertical slice function with AVX2/AVX-512 acceleration significantly improves the performance of the 2D Gaussian filter. Performance data: ff_verti_slice_c: 32.57; ff_verti_slice_avx2: 476.19; ff_verti_slice_avx512: 833.33. Co-authored-by: Cheng Yanfei <yanfei.cheng@intel.com> Co-authored-by: Jin Jun <jun.i.jin@intel.com> Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
author: Wu Jianhua <jianhua.wu@intel.com> 2021-08-04 05:06:13 +0300
committer: Paul B Mahol <onemda@gmail.com> 2021-08-29 20:58:33 +0300
commit: 68a2722aee2868084ad3ba1a7a5431735eab049e (patch)
tree: 647893d1ea8883e2b9a15e707b12b6aeb1be6618 /libavfilter/x86
parent: 4a5e24721c2bd1839aec57730061884fe2c5dd3b (diff)
2 files changed, 196 insertions, 0 deletions
diff --git a/libavfilter/x86/vf_gblur.asm b/libavfilter/x86/vf_gblur.asm
index 276fe347f5..ac4debba74 100644
--- a/libavfilter/x86/vf_gblur.asm
+++ b/libavfilter/x86/vf_gblur.asm
@@ -22,6 +22,43 @@
 
 SECTION .text
 
+%xdefine AVX2_MMSIZE   32 ; mmsize value (bytes per vector register) that selects the AVX2 code paths
+%xdefine AVX512_MMSIZE 64 ; AVX-512 counterpart; the macros visible here only test AVX2_MMSIZE and use the else branch otherwise
+
+%macro MOVSXDIFNIDN 1-* ; one or more register base names, given without a size suffix
+    %rep %0 ; once per argument
+        movsxdifnidn %1q, %1d ; widen the 32-bit view into the 64-bit register (helper defined outside this chunk; presumably a sign-extending move)
+        %rotate 1 ; shift the argument list so the next name is first
+    %endrep
+%endmacro
+
+%macro PUSH_MASK 5 ; args: vector mask reg, opmask reg, 32-bit GPR holding the lane bitmask, 32-bit scratch GPR, scratch memory address
+%if mmsize == AVX2_MMSIZE ; AVX2 path: expand the bitmask into one all-ones/all-zeros dword per lane
+    %assign %%n mmsize/4 ; number of dword lanes in a vector
+    %assign %%i 0
+    %rep %%n
+        mov %4, %3
+        and %4, 1 ; isolate the bit of the current lane
+        neg %4 ; 1 becomes 0xFFFFFFFF, 0 stays 0
+        mov dword [%5 + %%i*4], %4 ; store the lane mask into scratch memory
+        sar %3, 1 ; move on to the next lane's bit (the bitmask GPR is consumed)
+        %assign %%i %%i+1
+    %endrep
+    movu %1, [%5] ; load the mmsize bytes just written as the vector mask
+%else
+    kmovd %2, %3 ; any other mmsize: the bitmask is copied directly into the opmask register
+%endif
+%endmacro
+
+%macro VMASKMOVPS 4 ; args: dst, src, vector mask (used when mmsize is 32), opmask register (used otherwise)
+%if mmsize == AVX2_MMSIZE
+    vpmaskmovd %1, %3, %2 ; masked dword move; works as load or store depending on which operand is memory
+%else
+    kmovw k7, %4 ; copy the low 16 mask bits into k7 (k7 is clobbered on every use)
+    vmovups %1{k7}, %2 ; masked move without {z}: lanes whose mask bit is clear are not written
+%endif
+%endmacro
+
 ; void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps,
 ;                          float nu, float bscale)
 
@@ -232,3 +269,155 @@ POSTSCALE_SLICE
 INIT_ZMM avx512
 POSTSCALE_SLICE
 %endif
+
+
+; Vertical recursive filter over columns [column_begin, column_end) of buffer.
+; void ff_verti_slice(float *buffer, int width, int height, int column_begin,
+;                     int column_end, int steps, float nu, float bscale);
+;*******************************************************************************
+%macro VERTI_SLICE 0
+%if UNIX64
+cglobal verti_slice, 6, 12, 9, 0-mmsize*2, buffer, width, height, cbegin, cend, \
+                                         steps, x, y, cwidth, step, ptr, stride
+%else
+cglobal verti_slice, 6, 12, 9, 0-mmsize*2, buffer, width, height, cbegin, cend, \
+                                         steps, nu, bscale, x, y, cwidth, step, \
+                                         ptr, stride
+%endif
+%assign cols mmsize/4 ; floats (columns) processed per vector register
+%if WIN64
+    VBROADCASTSS m0, num ; m0 = nu in every lane, read from the argument's memory slot
+    VBROADCASTSS m1, bscalem ; m1 = bscale in every lane
+    DEFINE_ARGS buffer, width, height, cbegin, cend, \
+                steps, x, y, cwidth, step, ptr, stride
+%else
+    VBROADCASTSS m0, xmm0 ; nu
+    VBROADCASTSS m1, xmm1 ; bscale
+%endif
+    MOVSXDIFNIDN width, height, cbegin, cend, steps ; widen the int args on every ABI before their 64-bit registers are used
+    mov cwidthq, cendq
+    sub cwidthq, cbeginq ; cwidth = column_end - column_begin
+    lea strideq, [widthq * 4] ; row pitch in bytes (width floats)
+
+    xor xq, xq ; x = 0
+    cmp cwidthq, 0x0
+    jle .end_scalar ; empty or inverted column range: nothing to filter
+    cmp cwidthq, cols
+    jl .x_scalar ; fewer than cols columns: only the masked tail path runs
+
+    sub cwidthq, cols ; cwidth = largest x at which a full vector still fits
+.loop_x:
+    xor stepq, stepq
+    .loop_step: ; NOTE(review): do-while shape, the body runs once even if steps <= 0
+        ; ptr = buffer + x + column_begin;
+        lea ptrq, [xq + cbeginq]
+        lea ptrq, [bufferq + ptrq*4]
+
+        ; ptr[cols-1:0] *= bscale;
+        movu m2, [ptrq]
+        mulps m2, m1
+        movu [ptrq], m2
+
+        ; Filter downwards
+        mov yq, 1
+        .loop_y_down: ; NOTE(review): do-while shape, row 1 is accessed even if height == 1
+            add ptrq, strideq ; ptrq += width
+            movu m3, [ptrq]
+            FMULADD_PS m2, m2, m0, m3, m2 ; presumably m2 = m2 * nu + m3 (macro defined outside this chunk)
+            movu [ptrq], m2
+
+            inc yq
+            cmp yq, heightq
+            jl .loop_y_down
+
+        mulps m2, m1 ; scale the last row written by the downward pass by bscale
+        movu [ptrq], m2
+
+        ; Filter upwards
+        dec yq ; y = height - 1 when height >= 2
+        .loop_y_up:
+            sub ptrq, strideq ; step one row up
+            movu m3, [ptrq]
+            FMULADD_PS m2, m2, m0, m3, m2
+            movu [ptrq], m2
+
+            dec yq
+            cmp yq, 0
+            jg .loop_y_up ; loop ends with ptr back on row 0
+
+        inc stepq
+        cmp stepq, stepsq
+        jl .loop_step
+
+    add xq, cols
+    cmp xq, cwidthq
+    jle .loop_x ; another full vector of columns fits
+
+    add cwidthq, cols ; restore cwidth to the total column count
+    cmp xq, cwidthq
+    jge .end_scalar ; no leftover columns
+
+.x_scalar:
+    xor stepq, stepq
+    mov qword [rsp + 0x10], xq ; spill x: xq/xd are used as scratch until the mask is built
+    sub cwidthq, xq ; number of leftover columns
+    mov xq, 1
+    shlx cwidthq, xq, cwidthq
+    sub cwidthq, 1 ; bitmask = (1 << leftover) - 1, one bit per remaining column
+    PUSH_MASK m4, k1, cwidthd, xd, rsp + 0x20
+    mov xq, qword [rsp + 0x10] ; reload x
+
+    .loop_step_scalar: ; same algorithm as .loop_step, with every load/store masked to the leftover columns
+        lea ptrq, [xq + cbeginq]
+        lea ptrq, [bufferq + ptrq*4]
+
+        VMASKMOVPS m2, [ptrq], m4, k1
+        mulps m2, m1
+        VMASKMOVPS [ptrq], m2, m4, k1
+
+        ; Filter downwards
+        mov yq, 1
+        .x_scalar_loop_y_down: ; NOTE(review): do-while shape, same height == 1 caveat as the vector loop
+            add ptrq, strideq
+            VMASKMOVPS m3, [ptrq], m4, k1
+            FMULADD_PS m2, m2, m0, m3, m2
+            VMASKMOVPS [ptrq], m2, m4, k1
+
+            inc yq
+            cmp yq, heightq
+            jl .x_scalar_loop_y_down
+
+        mulps m2, m1 ; scale the last row by bscale
+        VMASKMOVPS [ptrq], m2, m4, k1
+
+        ; Filter upwards
+        dec yq
+        .x_scalar_loop_y_up:
+            sub ptrq, strideq
+            VMASKMOVPS m3, [ptrq], m4, k1
+            FMULADD_PS m2, m2, m0, m3, m2
+            VMASKMOVPS [ptrq], m2, m4, k1
+
+            dec yq
+            cmp yq, 0
+            jg .x_scalar_loop_y_up
+
+        inc stepq
+        cmp stepq, stepsq
+        jl .loop_step_scalar
+
+.end_scalar:
+    RET
+%endmacro
+
+%if ARCH_X86_64 ; VERTI_SLICE requests 12 general-purpose registers and uses 64-bit register views
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2 ; instantiate with ymm registers (the mmsize == AVX2_MMSIZE paths)
+VERTI_SLICE
+%endif ; HAVE_AVX2_EXTERNAL
+
+%if HAVE_AVX512_EXTERNAL
+INIT_ZMM avx512 ; instantiate with zmm registers (the opmask-based paths)
+VERTI_SLICE
+%endif ; HAVE_AVX512_EXTERNAL
+%endif ; ARCH_X86_64
diff --git a/libavfilter/x86/vf_gblur_init.c b/libavfilter/x86/vf_gblur_init.c
index 34aba4ca6e..3e173410c2 100644
--- a/libavfilter/x86/vf_gblur_init.c
+++ b/libavfilter/x86/vf_gblur_init.c
@@ -31,6 +31,11 @@ void ff_postscale_slice_sse(float *ptr, int length, float postscale, float min,
 void ff_postscale_slice_avx2(float *ptr, int length, float postscale, float min, float max);
 void ff_postscale_slice_avx512(float *ptr, int length, float postscale, float min, float max);
 
+void ff_verti_slice_avx2(float *buffer, int width, int height, int column_begin, int column_end,
+                        int steps, float nu, float bscale); /* presumably the INIT_YMM avx2 instance of VERTI_SLICE in vf_gblur.asm */
+void ff_verti_slice_avx512(float *buffer, int width, int height, int column_begin, int column_end,
+                        int steps, float nu, float bscale); /* presumably the INIT_ZMM avx512 instance of VERTI_SLICE in vf_gblur.asm */
+
 av_cold void ff_gblur_init_x86(GBlurContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -47,9 +52,11 @@ av_cold void ff_gblur_init_x86(GBlurContext *s)
     }
     if (EXTERNAL_AVX2(cpu_flags)) {
         s->horiz_slice = ff_horiz_slice_avx2;
+        s->verti_slice = ff_verti_slice_avx2;
     }
     if (EXTERNAL_AVX512(cpu_flags)) {
         s->postscale_slice = ff_postscale_slice_avx512;
+        s->verti_slice = ff_verti_slice_avx512;
     }
 #endif
 }
author	Wu Jianhua <jianhua.wu@intel.com>	2021-08-04 05:06:13 +0300
committer	Paul B Mahol <onemda@gmail.com>	2021-08-29 20:58:33 +0300
commit	68a2722aee2868084ad3ba1a7a5431735eab049e (patch)
tree	647893d1ea8883e2b9a15e707b12b6aeb1be6618 /libavfilter/x86
parent	4a5e24721c2bd1839aec57730061884fe2c5dd3b (diff)