Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/FFmpeg/FFmpeg.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Martin Vignali <martin.vignali@gmail.com> 2017-12-19 23:06:01 +0300
committer: Martin Vignali <martin.vignali@gmail.com> 2017-12-19 23:10:09 +0300
commit: f181648176c0d93851d4a89410bbdd9c85e1fa7c (patch)
tree: 0c0255b9a6724cd990deea50880dbd6be6b3884f /libavfilter/x86
parent: a4a4179e83e61a36cf6900a1ca2a61b9efb03350 (diff)
avfilter/x86/vf_hflip : add avx2 version for hflip_byte and hflip_short
Diffstat (limited to 'libavfilter/x86')
-rw-r--r-- libavfilter/x86/vf_hflip.asm | 12
-rw-r--r-- libavfilter/x86/vf_hflip_init.c | 20
2 files changed, 27 insertions, 5 deletions
diff --git a/libavfilter/x86/vf_hflip.asm b/libavfilter/x86/vf_hflip.asm
index 82e1154d21..6bd1782da4 100644
--- a/libavfilter/x86/vf_hflip.asm
+++ b/libavfilter/x86/vf_hflip.asm
@@ -32,7 +32,7 @@ SECTION .text
;%1 byte or short, %2 b or w, %3 size in byte (1 for byte, 2 for short)
%macro HFLIP 3
cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x
- mova m0, [pb_flip_%1]
+ VBROADCASTI128 m0, [pb_flip_%1]
xor xq, xq
%if %3 == 1
movsxdifnidn wq, wd
@@ -47,8 +47,13 @@ cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x
.loop0:
neg xq
+%if mmsize == 32
+ vpermq m1, [srcq + xq - mmsize + %3], 0x4e; flip each lane at load
+ vpermq m2, [srcq + xq - 2 * mmsize + %3], 0x4e; flip each lane at load
+%else
movu m1, [srcq + xq - mmsize + %3]
movu m2, [srcq + xq - 2 * mmsize + %3]
+%endif
pshufb m1, m0
pshufb m2, m0
neg xq
@@ -78,3 +83,8 @@ INIT_XMM ssse3
HFLIP byte, b, 1
HFLIP short, w, 2
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+HFLIP byte, b, 1
+HFLIP short, w, 2
+%endif
diff --git a/libavfilter/x86/vf_hflip_init.c b/libavfilter/x86/vf_hflip_init.c
index 2b5c9d3bf3..0ac399b0d4 100644
--- a/libavfilter/x86/vf_hflip_init.c
+++ b/libavfilter/x86/vf_hflip_init.c
@@ -24,7 +24,9 @@
#include "libavfilter/hflip.h"
void ff_hflip_byte_ssse3(const uint8_t *src, uint8_t *dst, int w);
+void ff_hflip_byte_avx2(const uint8_t *src, uint8_t *dst, int w);
void ff_hflip_short_ssse3(const uint8_t *src, uint8_t *dst, int w);
+void ff_hflip_short_avx2(const uint8_t *src, uint8_t *dst, int w);
av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
{
@@ -32,10 +34,20 @@ av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
int i;
for (i = 0; i < nb_planes; i++) {
- if (EXTERNAL_SSSE3(cpu_flags) && step[i] == 1) {
- s->flip_line[i] = ff_hflip_byte_ssse3;
- } else if (EXTERNAL_SSSE3(cpu_flags) && step[i] == 2) {
- s->flip_line[i] = ff_hflip_short_ssse3;
+ if (step[i] == 1) {
+ if (EXTERNAL_SSSE3(cpu_flags)) {
+ s->flip_line[i] = ff_hflip_byte_ssse3;
+ }
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ s->flip_line[i] = ff_hflip_byte_avx2;
+ }
+ } else if (step[i] == 2) {
+ if (EXTERNAL_SSSE3(cpu_flags)) {
+ s->flip_line[i] = ff_hflip_short_ssse3;
+ }
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ s->flip_line[i] = ff_hflip_short_avx2;
+ }
}
}
}